comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/shared/ReadStats.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package shared;
2
3 import java.util.ArrayList;
4 import java.util.Arrays;
5 import java.util.Locale;
6
7 import align2.QualityTools;
8 import dna.AminoAcid;
9 import fileIO.ByteStreamWriter;
10 import fileIO.TextStreamWriter;
11 import stream.Read;
12 import stream.SamLine;
13 import structures.ByteBuilder;
14 import structures.EntropyTracker;
15 import structures.LongList;
16 import structures.SuperLongList;
17
18
19 /**
20 * @author Brian Bushnell
21 * @date Mar 18, 2013
22 *
23 */
24 public class ReadStats {
25
26 public ReadStats(){this(true);}
27
28 public ReadStats(boolean addToList){
29 if(addToList){
30 synchronized(ReadStats.class){
31 objectList.add(this);
32 }
33 }
34
35 if(COLLECT_QUALITY_STATS){
36 aqualArray=new long[2][127];
37 qualLength=new long[2][MAXLEN];
38 qualSum=new long[2][MAXLEN];
39 qualSumDouble=new double[2][MAXLEN];
40 bqualHistOverall=new long[127];
41 }else{
42 aqualArray=null;
43 qualLength=null;
44 qualSum=null;
45 qualSumDouble=null;
46 bqualHistOverall=null;
47 }
48
49 if(BQUAL_HIST_FILE!=null){
50 bqualHist=new long[2][MAXLEN][127];
51 }else{
52 bqualHist=null;
53 }
54
55 if(QUAL_COUNT_HIST_FILE!=null){
56 qcountHist=new long[2][127];
57 }else{
58 qcountHist=null;
59 }
60
61 if(COLLECT_MATCH_STATS){
62 matchSum=new long[2][MAXLEN];
63 delSum=new long[2][MAXLEN];
64 insSum=new long[2][MAXLEN];
65 subSum=new long[2][MAXLEN];
66 nSum=new long[2][MAXLEN];
67 clipSum=new long[2][MAXLEN];
68 otherSum=new long[2][MAXLEN];
69 }else{
70 matchSum=null;
71 delSum=null;
72 insSum=null;
73 subSum=null;
74 nSum=null;
75 clipSum=null;
76 otherSum=null;
77 }
78
79 if(COLLECT_QUALITY_ACCURACY){
80 qualMatch=new long[99];
81 qualSub=new long[99];
82 qualIns=new long[99];
83 qualDel=new long[99];
84 }else{
85 qualMatch=null;
86 qualSub=null;
87 qualIns=null;
88 qualDel=null;
89 }
90
91 if(COLLECT_INSERT_STATS){
92 insertHist=new LongList(MAXLEN);
93 }else{
94 insertHist=null;
95 }
96
97 if(COLLECT_BASE_STATS){
98 baseHist=new LongList[2][5];
99 for(int i=0; i<baseHist.length; i++){
100 for(int j=0; j<baseHist[i].length; j++){
101 baseHist[i][j]=new LongList(400);
102 }
103 }
104 }else{
105 baseHist=null;
106 }
107
108
109 if(COLLECT_INDEL_STATS){
110 insHist=new LongList(100);
111 delHist=new LongList(100);
112 delHist2=new LongList(100);
113 }else{
114 insHist=null;
115 delHist=null;
116 delHist2=null;
117 }
118
119 if(COLLECT_GC_STATS){
120 gcHist=new long[GC_BINS+1];
121 }else{
122 gcHist=null;
123 }
124
125 if(COLLECT_ENTROPY_STATS){
126 entropyHist=new long[ENTROPY_BINS+1];
127 eTracker=new EntropyTracker(Shared.AMINO_IN, 0, true);
128 }else{
129 entropyHist=null;
130 eTracker=null;
131 }
132
133 if(COLLECT_ERROR_STATS){
134 errorHist=new LongList(100);
135 }else{
136 errorHist=null;
137 }
138
139 if(COLLECT_LENGTH_STATS){
140 lengthHist=new SuperLongList(20000);
141 }else{
142 lengthHist=null;
143 }
144
145 if(COLLECT_IDENTITY_STATS){
146 idHist=new long[ID_BINS+1];
147 idBaseHist=new long[ID_BINS+1];
148 }else{
149 idHist=null;
150 idBaseHist=null;
151 }
152
153 if(COLLECT_TIME_STATS){
154 timeHist=new LongList(1001);
155 }else{
156 timeHist=null;
157 }
158
159 }
160
161 public static ReadStats mergeAll(){
162 if(objectList==null || objectList.isEmpty()){return merged=null;}
163 if(objectList.size()==1){return merged=objectList.get(0);}
164
165 ReadStats x=new ReadStats(false);
166 for(ReadStats rs : objectList){
167 x.read2Count+=rs.read2Count;
168 if(COLLECT_QUALITY_STATS){
169 for(int i=0; i<MAXLEN; i++){
170 x.qualLength[0][i]+=rs.qualLength[0][i];
171 x.qualLength[1][i]+=rs.qualLength[1][i];
172 x.qualSum[0][i]+=rs.qualSum[0][i];
173 x.qualSum[1][i]+=rs.qualSum[1][i];
174 x.qualSumDouble[0][i]+=rs.qualSumDouble[0][i];
175 x.qualSumDouble[1][i]+=rs.qualSumDouble[1][i];
176 }
177 for(int i=0; i<x.aqualArray[0].length; i++){
178 x.aqualArray[0][i]+=rs.aqualArray[0][i];
179 x.aqualArray[1][i]+=rs.aqualArray[1][i];
180 }
181 for(int i=0; i<x.bqualHistOverall.length; i++){
182 x.bqualHistOverall[i]+=rs.bqualHistOverall[i];
183 }
184 if(BQUAL_HIST_FILE!=null){
185 for(int i=0; i<x.bqualHist.length; i++){
186 for(int j=0; j<x.bqualHist[i].length; j++){
187 for(int k=0; k<x.bqualHist[i][j].length; k++){
188 x.bqualHist[i][j][k]+=rs.bqualHist[i][j][k];
189 }
190 }
191 }
192 }
193 if(QUAL_COUNT_HIST_FILE!=null){
194 for(int i=0; i<x.qcountHist.length; i++){
195 for(int j=0; j<x.qcountHist[i].length; j++){
196 x.qcountHist[i][j]+=rs.qcountHist[i][j];
197 }
198 }
199 }
200 }
201
202 if(COLLECT_MATCH_STATS){
203 for(int i=0; i<MAXLEN; i++){
204 x.matchSum[0][i]+=rs.matchSum[0][i];
205 x.matchSum[1][i]+=rs.matchSum[1][i];
206 x.delSum[0][i]+=rs.delSum[0][i];
207 x.delSum[1][i]+=rs.delSum[1][i];
208 x.insSum[0][i]+=rs.insSum[0][i];
209 x.insSum[1][i]+=rs.insSum[1][i];
210 x.subSum[0][i]+=rs.subSum[0][i];
211 x.subSum[1][i]+=rs.subSum[1][i];
212 x.nSum[0][i]+=rs.nSum[0][i];
213 x.nSum[1][i]+=rs.nSum[1][i];
214 x.clipSum[0][i]+=rs.clipSum[0][i];
215 x.clipSum[1][i]+=rs.clipSum[1][i];
216 x.otherSum[0][i]+=rs.otherSum[0][i];
217 x.otherSum[1][i]+=rs.otherSum[1][i];
218 }
219 }
220 if(COLLECT_INSERT_STATS){
221 x.insertHist.incrementBy(rs.insertHist);
222 x.pairedCount+=rs.pairedCount;
223 x.unpairedCount+=rs.unpairedCount;
224 }
225 if(COLLECT_BASE_STATS){
226 for(int i=0; i<rs.baseHist.length; i++){
227 for(int j=0; j<rs.baseHist[i].length; j++){
228 x.baseHist[i][j].incrementBy(rs.baseHist[i][j]);
229 }
230 }
231 }
232 if(COLLECT_QUALITY_ACCURACY){
233 for(int i=0; i<x.qualMatch.length; i++){
234 x.qualMatch[i]+=rs.qualMatch[i];
235 x.qualSub[i]+=rs.qualSub[i];
236 x.qualIns[i]+=rs.qualIns[i];
237 x.qualDel[i]+=rs.qualDel[i];
238 }
239 }
240
241
242 if(COLLECT_INDEL_STATS){
243 x.delHist.incrementBy(rs.delHist);
244 x.delHist2.incrementBy(rs.delHist2);
245 x.insHist.incrementBy(rs.insHist);
246 }
247
248 if(COLLECT_LENGTH_STATS){
249 x.lengthHist.incrementBy(rs.lengthHist);
250 }
251
252
253 if(COLLECT_ERROR_STATS){
254 x.errorHist.incrementBy(rs.errorHist);
255 }
256
257 if(COLLECT_GC_STATS){
258 for(int i=0; i<rs.gcHist.length; i++){
259 x.gcHist[i]+=rs.gcHist[i];
260 }
261 }
262
263 if(COLLECT_ENTROPY_STATS){
264 for(int i=0; i<rs.entropyHist.length; i++){
265 x.entropyHist[i]+=rs.entropyHist[i];
266 }
267 }
268
269 if(COLLECT_IDENTITY_STATS){
270 for(int i=0; i<rs.idHist.length; i++){
271 x.idHist[i]+=rs.idHist[i];
272 x.idBaseHist[i]+=rs.idBaseHist[i];
273 }
274 }
275
276 if(COLLECT_TIME_STATS){
277 x.timeHist.incrementBy(rs.timeHist);
278 }
279
280 x.gcMaxReadLen=Tools.max(x.gcMaxReadLen, rs.gcMaxReadLen);
281 x.idMaxReadLen=Tools.max(x.idMaxReadLen, rs.idMaxReadLen);
282 }
283
284 merged=x;
285 return x;
286 }
287
288 public void addToQualityHistogram(final Read r){
289 if(r==null){return;}
290 addToQualityHistogram2(r);
291 if(r.mate!=null){addToQualityHistogram2(r.mate);}
292 }
293
294 private void addToQualityHistogram2(final Read r){
295 final int pairnum=r.samline==null ? r.pairnum() : r.samline.pairnum();
296 if(r==null || r.quality==null || r.quality.length<1){return;}
297 byte[] quals=r.quality, bases=r.bases;
298 final Object obj=r.obj;
299 if(obj!=null && obj.getClass()==TrimRead.class){
300 quals=(pairnum==0 ? ((TrimRead)obj).qual1 : ((TrimRead)obj).qual2);
301 bases=(pairnum==0 ? ((TrimRead)obj).bases1 : ((TrimRead)obj).bases2);
302 }
303 if(pairnum==1){read2Count++;}
304 addToQualityHistogram(quals, pairnum);
305 int x=Read.avgQualityByProbabilityInt(bases, quals, true, 0);
306 aqualArray[pairnum][x]++;
307 if(BQUAL_HIST_FILE!=null){
308 addToBQualityHistogram(quals, pairnum);
309 }
310 if(QUAL_COUNT_HIST_FILE!=null){
311 addToQCountHistogram(quals, pairnum);
312 }
313 }
314
315 public void addToQualityHistogram(byte[] qual, int pairnum){
316 if(qual==null || qual.length<1){return;}
317 final int limit=Tools.min(qual.length, MAXLEN);
318 final long[] ql=qualLength[pairnum], qs=qualSum[pairnum];
319 final double[] qsd=qualSumDouble[pairnum];
320 ql[limit-1]++;
321 for(int i=0; i<limit; i++){
322 qs[i]+=qual[i];
323 qsd[i]+=QualityTools.PROB_ERROR[qual[i]];
324 }
325 for(byte q : qual){
326 bqualHistOverall[q]++;
327 }
328 }
329
330 private void addToBQualityHistogram(byte[] qual, int pairnum){
331 if(qual==null || qual.length<1){return;}
332 final int limit=Tools.min(qual.length, MAXLEN);
333 final long[][] bqh=bqualHist[pairnum];
334 for(int i=0; i<limit; i++){
335 bqh[i][qual[i]]++;
336 }
337 }
338
339 private void addToQCountHistogram(byte[] qual, int pairnum){
340 if(qual==null || qual.length<1){return;}
341 final long[] qch=qcountHist[pairnum];
342 for(byte q : qual){
343 qch[q]++;
344 }
345 }
346
347 public void addToQualityAccuracy(final Read r){
348 if(r==null){return;}
349 addToQualityAccuracy(r, 0);
350 if(r.mate!=null){addToQualityAccuracy(r.mate, 1);}
351 }
352
353 public void addToQualityAccuracy(final Read r, int pairnum){
354 if(r==null || r.quality==null || r.quality.length<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
355 final byte[] bases=r.bases;
356 final byte[] qual=r.quality;
357 byte[] match=r.match;
358
359 if(r.shortmatch()){match=Read.toLongMatchString(match);}
360
361 final boolean plus=(r.strand()==0);
362 int bpos=0;
363 byte lastm='A';
364 for(int mpos=0; mpos<match.length/* && rpos<limit*/; mpos++){
365 byte b=bases[bpos];
366 byte q=qual[bpos];
367 byte m=match[plus ? mpos : match.length-mpos-1];
368
369 {
370 if(m=='m'){
371 qualMatch[q]++;
372 }else if(m=='S'){
373 qualSub[q]++;
374 }else if(m=='I'){
375 if(AminoAcid.isFullyDefined(b)){qualIns[q]++;}
376 }else if(m=='N'){
377 //do nothing
378 }else if(m=='C'){
379 //do nothing
380 }else if(m=='V' || m=='i'){
381 //do nothing
382 }else if(m=='D'){
383 if(lastm!=m){
384 int x=bpos, y=bpos-1;
385 if(x<qual.length){
386 if(AminoAcid.isFullyDefined(bases[x])){
387 qualDel[qual[x]]++;
388 }
389 }
390 if(y>=0){
391 if(AminoAcid.isFullyDefined(bases[y])){
392 qualDel[qual[y]]++;
393 }
394 }
395 }
396 bpos--;
397 }else if(m=='d'){
398 bpos--;
399 }else{
400 assert(!Tools.isDigit(m)) : ((char)m);
401 }
402 }
403
404 bpos++;
405 lastm=m;
406 }
407
408 }
409
410 public void addToErrorHistogram(final Read r){
411 if(r==null){return;}
412 addToErrorHistogram(r, 0);
413 if(r.mate!=null){addToErrorHistogram(r.mate, 1);}
414 }
415
416 private void addToErrorHistogram(final Read r, int pairnum){
417 if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
418 r.toLongMatchString(false);
419 int x=r.countSubs();
420 errorHist.increment(x, 1);
421 }
422
423 public void addToLengthHistogram(final Read r){
424 if(r==null){return;}
425 addToLengthHistogram(r, 0);
426 if(r.mate!=null){addToLengthHistogram(r.mate, 1);}
427 }
428
429 private void addToLengthHistogram(final Read r, int pairnum){
430 if(r==null || r.bases==null){return;}
431 int x=r.length();//Tools.min(r.length(), MAXLENGTHLEN); Old style before SLL
432 lengthHist.increment(x, 1);
433 }
434
435 public void addToGCHistogram(final Read r1){
436 if(r1==null){return;}
437 final Read r2=r1.mate;
438 final int len1=r1.length(), len2=r1.mateLength();
439
440 final float gc1=(len1>0 ? r1.gc() : -1);
441 final float gc2=(len2>0 ? r2.gc() : -1);
442 if(usePairGC){
443 final float gc;
444 if(r2==null){
445 gc=gc1;
446 }else{
447 gc=(gc1*len1+gc2*len2)/(len1+len2);
448 }
449 addToGCHistogram(gc, len1+len2);
450 }else{
451 addToGCHistogram(gc1, len1);
452 addToGCHistogram(gc2, len2);
453 }
454 }
455
456 private void addToGCHistogram(final float gc, final int len){
457 if(gc<0 || len<1){return;}
458 gcHist[Tools.min(GC_BINS, (int)(gc*(GC_BINS+1)))]++;
459 gcMaxReadLen=Tools.max(len, gcMaxReadLen);
460 }
461
462 public void addToEntropyHistogram(final Read r1){
463 if(r1==null){return;}
464 final Read r2=r1.mate;
465 final int len1=r1.length(), len2=r1.mateLength();
466
467 final float entropy1=(len1>0 ? eTracker.averageEntropy(r1.bases, allowEntropyNs) : -1);
468 final float entropy2=(len2>0 ? eTracker.averageEntropy(r2.bases, allowEntropyNs) : -1);
469 if(/* usePairEntropy */ false){
470 final float entropy;
471 if(r2==null){
472 entropy=entropy1;
473 }else{
474 entropy=(entropy1*len1+entropy2*len2)/(len1+len2);
475 }
476 addToEntropyHistogram(entropy, len1+len2);
477 }else{
478 addToEntropyHistogram(entropy1, len1);
479 addToEntropyHistogram(entropy2, len2);
480 }
481 }
482
483 private void addToEntropyHistogram(final float entropy, final int len){
484 if(entropy<0 || len<1){return;}
485 entropyHist[Tools.min(ENTROPY_BINS, (int)(entropy*(ENTROPY_BINS+1)))]++;
486 }
487
488 public void addToIdentityHistogram(final Read r){
489 if(r==null){return;}
490 addToIdentityHistogram(r, 0);
491 if(r.mate!=null){addToIdentityHistogram(r.mate, 1);}
492 }
493
494 private void addToIdentityHistogram(final Read r, int pairnum){
495 if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
496 float id=r.identity();
497 idHist[(int)(id*ID_BINS)]++;
498 idBaseHist[(int)(id*ID_BINS)]+=r.length();
499 idMaxReadLen=Tools.max(r.length(), idMaxReadLen);
500 }
501
502 public void addToTimeHistogram(final Read r){
503 if(r==null){return;}
504 addToTimeHistogram(r, 0);//Time for pairs is the same.
505 }
506
507 private void addToTimeHistogram(final Read r, int pairnum){
508 if(r==null){return;}
509 assert(r.obj!=null && r.obj.getClass()==Long.class);
510 int x=(int)Tools.min(((Long)r.obj).longValue(), MAXTIMELEN);
511 timeHist.increment(x, 1);
512 }
513
514 public boolean addToIndelHistogram(final Read r){
515 if(r==null){return false;}
516 boolean success=addToIndelHistogram(r, 0);
517 if(r.mate!=null){success=addToIndelHistogram(r.mate, 1)|success;}
518 return success;
519 }
520
521 /** Handles short match, long match, and reads with attached SamLines */
522 private boolean addToIndelHistogram(final Read r, int pairnum){
523 if(r==null || !r.mapped()){return false;}
524 if(r.samline!=null){
525 boolean success=addToIndelHistogram(r.samline);
526 if(success){return true;}
527 }
528 if(r.match==null/* || r.discarded()*/){return false;}
529 final byte[] match=r.match;
530
531 byte lastLetter='?';
532 boolean digit=false;
533 int streak=0;
534 for(int mpos=0; mpos<match.length; mpos++){
535 final byte m=match[mpos];
536
537 if(Tools.isDigit(m)){
538 streak=streak*10+m-'0';
539 digit=true;
540 }else{
541 if(lastLetter==m){
542 streak++;
543 }else{
544 // assert(streak>0 || (streak==0 && lastm=='?'));
545 if(!digit){streak++;}
546 digit=false;
547 if(lastLetter=='D'){
548 streak=Tools.min(streak, MAXDELLEN2);
549 if(streak<MAXDELLEN){delHist.increment(streak, 1);}
550 delHist2.increment(streak/100, 1);
551 // System.err.println("A. Del: "+streak+", "+MAXDELLEN+", "+MAXDELLEN2);
552 }else if(lastLetter=='I'){
553 streak=Tools.min(streak, MAXINSLEN);
554 insHist.increment(streak, 1);
555 // System.err.println("A. Ins: "+streak+", "+MAXINSLEN);
556 }
557 streak=0;
558 }
559 lastLetter=m;
560 }
561 }
562
563 {//Final symbol
564 if(!digit){streak++;}
565 digit=false;
566 if(lastLetter=='D'){
567 streak=Tools.min(streak, MAXDELLEN2);
568 if(streak<MAXDELLEN){delHist.increment(streak, 1);}
569 delHist2.increment(streak/100, 1);
570 // System.err.println("A. Del: "+streak+", "+MAXDELLEN+", "+MAXDELLEN2);
571 }else if(lastLetter=='I'){
572 streak=Tools.min(streak, MAXINSLEN);
573 insHist.increment(streak, 1);
574 // System.err.println("A. Ins: "+streak+", "+MAXINSLEN);
575 }
576 streak=0;
577 }
578 return true;
579 }
580
581 private boolean addToIndelHistogram(SamLine sl){
582 if(sl==null || sl.cigar==null || !sl.mapped()){
583 return false;
584 }
585 final String cigar=sl.cigar;
586 // final int pairnum=sl.pairnum();
587
588 int count=0;
589 for(int cpos=0; cpos<cigar.length(); cpos++){
590 final char c=cigar.charAt(cpos);
591
592 if(Tools.isDigit(c)){
593 count=count*10+c-'0';
594 }else{
595 if(c=='I'){
596 int streak=Tools.min(count, MAXINSLEN);
597 insHist.increment(streak, 1);
598 // System.err.println("A. Ins: "+streak+", "+MAXINSLEN);
599 }else if(c=='D'){
600 int streak=Tools.min(count, MAXDELLEN2);
601 if(streak<MAXDELLEN){delHist.increment(streak, 1);}
602 delHist2.increment(streak/100, 1);
603 // System.err.println("A. Del: "+streak+", "+MAXDELLEN+", "+MAXDELLEN2);
604 }else{
605 //Ignore
606 }
607 count=0;
608 }
609 }
610 assert(count==0) : count;
611 return true;
612 }
613
614 public void addToMatchHistogram(final Read r){
615 if(r==null){return;}
616 addToMatchHistogram2(r);
617 if(r.mate!=null){addToMatchHistogram2(r.mate);}
618 }
619
620 private void addToMatchHistogram2(final Read r){
621 if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
622 final int pairnum=r.samline==null ? r.pairnum() : r.samline.pairnum();
623 if(pairnum==1){read2Count++;}
624 final byte[] bases=r.bases;
625 final int limit=Tools.min(bases.length, MAXLEN);
626 final long[] ms=matchSum[pairnum], ds=delSum[pairnum], is=insSum[pairnum],
627 ss=subSum[pairnum], ns=nSum[pairnum], cs=clipSum[pairnum], os=otherSum[pairnum];
628
629 byte[] match=r.match;
630 if(r.shortmatch() && match!=null){match=Read.toLongMatchString(match);}
631
632 if(match==null){
633 for(int i=0; i<limit; i++){
634 byte b=bases[i];
635 if(b=='N'){ns[i]++;}
636 else{os[i]++;}
637 }
638 }else{
639 final boolean plus=(r.strand()==0);
640 int bpos=0;
641 byte lastm='A';
642 for(int mpos=0; mpos<match.length && bpos<limit; mpos++){
643 byte b=bases[bpos];//bases[plus ? rpos : bases.length-rpos-1];
644 byte m=match[plus ? mpos : match.length-mpos-1];//match[mpos];
645 if(b=='N'){
646 if(m=='D'){
647 if(lastm!=m){ds[bpos]++;}
648 bpos--;
649 }else{ns[bpos]++;}
650 }else{
651 if(m=='m'){
652 ms[bpos]++;
653 }else if(m=='S'){
654 ss[bpos]++;
655 }else if(m=='I'){
656 is[bpos]++;
657 }else if(m=='N' || m=='V'){
658 // assert(false) : "\n"+r+"\n"+new String(Data.getChromosome(r.chrom).getBytes(r.start, r.stop))+"\nrpos="+rpos+", mpos="+mpos;
659 os[bpos]++;
660 }else if(m=='C'){
661 // assert(false) : r;
662 cs[bpos]++;
663 }else if(m=='D'){
664 if(lastm!=m){ds[bpos]++;}
665 bpos--;
666 }else if(m=='i'){
667 os[bpos]++;
668 }else if(m=='d'){
669 bpos--;
670 }else{
671 os[bpos]++;
672 assert(false) : "For read "+r.numericID+", unknown symbol in match string: ASCII "+m+" = "+(char)m;
673 }
674 }
675 bpos++;
676 lastm=m;
677 }
678 }
679 }
680
681 public void addToInsertHistogram(final Read r, boolean ignoreMappingStrand){
682 if(verbose){
683 System.err.print(r.numericID);
684 if(r==null || r.mate==null || !r.mapped() || !r.mate.mapped() || !r.paired()){
685 System.err.println("\n");
686 }else{
687 System.err.println("\t"+r.strand()+"\t"+r.insertSizeMapped(ignoreMappingStrand)+"\t"+r.mate.insertSizeMapped(ignoreMappingStrand));
688 }
689 }
690 if(r==null || r.mate==null || !r.mapped() || !r.mate.mapped() || !r.paired()){
691 unpairedCount++;
692 return;
693 }
694 int x=Tools.min(MAXINSERTLEN, r.insertSizeMapped(ignoreMappingStrand));
695 if(x>0){
696 insertHist.increment(x, 1);
697 pairedCount++;
698 }else{
699 unpairedCount++;
700 }
701 // assert(x!=1) : "\n"+r+"\n\n"+r.mate+"\n";
702 // System.out.println("Incrementing "+x);
703 }
704
705 public void addToInsertHistogram(final SamLine r1){
706 int x=r1.tlen;
707 if(x<0) {x=-x;}
708 x=Tools.min(MAXINSERTLEN, x);
709 if(r1.pairedOnSameChrom() && x>0){
710 pairedCount++;
711 insertHist.increment(x, 1);
712 }else {
713 unpairedCount++;
714 }
715 }
716
717 public void addToInsertHistogram(final SamLine r1, final SamLine r2){
718 if(r1==null){return;}
719 int x=insertSizeMapped(r1, r2, REQUIRE_PROPER_PAIR);
720 if(verbose){
721 System.err.println(r1.qname+"\t"+x);
722 }
723 x=Tools.min(MAXINSERTLEN, x);
724 if(x>0){
725 insertHist.increment(x, 1);
726 pairedCount++;
727 }else{
728 unpairedCount++;
729 }
730 }
731
732 /** This is untested and only gives approximate answers when overlapping reads contain indels.
733 * It may give incorrect answers for same-strange pairs that are shorter than read length.
734 * It might give negative answers but that would be a bug. */
735 public static int insertSizeMapped(SamLine r1, SamLine r2, boolean requireProperPair){
736 if(r2==null){return r1.length();}
737 if(!r1.mapped() || !r2.mapped() || !r1.pairedOnSameChrom() || (requireProperPair && !r1.properPair())){
738 return -1;
739 }
740
741 int a1=r1.start(true, false);
742 int a2=r2.start(true, false);
743
744 if(r1.strand()!=r2.strand()){
745 if(r1.strand()==1){return insertSizeMapped(r2, r1, requireProperPair);}
746 }else if(a1>a2){
747 return insertSizeMapped(r2, r1, requireProperPair);
748 }
749
750 int b1=r1.stop(a1, true, false);
751 int b2=r2.stop(a2, true, false);
752
753 int clen1=r1.calcCigarLength(true, false);
754 int clen2=r2.calcCigarLength(true, false);
755
756 int mlen1=b1-a1+1;
757 int mlen2=b2-a2+1;
758
759 int dif1=mlen1-clen1;
760 int dif2=mlen2-clen2;
761
762 int mlen12=b2-a1+1;
763
764 if(Tools.overlap(a1, b1, a2, b2)){//hard case
765 return mlen12-Tools.max(dif1, dif2); //Approximate
766 }else{//easy case
767 return mlen12-dif1-dif2;
768 }
769 }
770
771 public void addToBaseHistogram(final Read r){
772 addToBaseHistogram2(r);
773 if(r.mate!=null){addToBaseHistogram2(r.mate);}
774 }
775
776 public void addToBaseHistogram2(final Read r){
777 if(r==null || r.bases==null){return;}
778 final int pairnum=r.samline==null ? r.pairnum() : r.samline.pairnum();
779 if(pairnum==1){read2Count++;}
780 final byte[] bases=r.bases;
781 final LongList[] lists=baseHist[pairnum];
782 for(int i=0; i<bases.length; i++){
783 byte b=bases[i];
784 int x=AminoAcid.baseToNumber[b]+1;
785 lists[x].increment(i, 1);
786 }
787 }
788
789 public static boolean testFiles(boolean allowDuplicates){
790 return Tools.testOutputFiles(overwrite, append, allowDuplicates,
791 AVG_QUAL_HIST_FILE, QUAL_HIST_FILE, BQUAL_HIST_FILE, BQUAL_HIST_OVERALL_FILE, QUAL_COUNT_HIST_FILE,
792 MATCH_HIST_FILE, INSERT_HIST_FILE, BASE_HIST_FILE, QUAL_ACCURACY_FILE, INDEL_HIST_FILE, ERROR_HIST_FILE, LENGTH_HIST_FILE,
793 GC_HIST_FILE, ENTROPY_HIST_FILE, IDENTITY_HIST_FILE, TIME_HIST_FILE);
794 }
795
796 public static boolean writeAll(){
797 if(collectingStats()){
798 ReadStats rs=mergeAll();
799 boolean paired=rs.read2Count>0;
800
801 if(AVG_QUAL_HIST_FILE!=null){rs.writeAverageQualityToFile(AVG_QUAL_HIST_FILE, paired);}
802 if(QUAL_HIST_FILE!=null){rs.writeQualityToFile(QUAL_HIST_FILE, paired);}
803 if(BQUAL_HIST_FILE!=null){rs.writeBQualityToFile(BQUAL_HIST_FILE, paired);}
804 if(BQUAL_HIST_OVERALL_FILE!=null){rs.writeBQualityOverallToFile(BQUAL_HIST_OVERALL_FILE);}
805 if(QUAL_COUNT_HIST_FILE!=null){rs.writeQCountToFile(QUAL_COUNT_HIST_FILE, paired);}
806 if(MATCH_HIST_FILE!=null){rs.writeMatchToFile(MATCH_HIST_FILE, paired);}
807 if(INSERT_HIST_FILE!=null){rs.writeInsertToFile(INSERT_HIST_FILE);}
808 if(BASE_HIST_FILE!=null){rs.writeBaseContentToFile(BASE_HIST_FILE, paired);}
809 if(QUAL_ACCURACY_FILE!=null){rs.writeQualityAccuracyToFile(QUAL_ACCURACY_FILE);}
810
811 if(INDEL_HIST_FILE!=null){rs.writeIndelToFile(INDEL_HIST_FILE);}
812 if(ERROR_HIST_FILE!=null){rs.writeErrorToFile(ERROR_HIST_FILE);}
813 if(LENGTH_HIST_FILE!=null){rs.writeLengthToFile(LENGTH_HIST_FILE);}
814 if(GC_HIST_FILE!=null){rs.writeGCToFile(GC_HIST_FILE, true);}
815 if(ENTROPY_HIST_FILE!=null){rs.writeEntropyToFile(ENTROPY_HIST_FILE, true);}
816 if(IDENTITY_HIST_FILE!=null){rs.writeIdentityToFile(IDENTITY_HIST_FILE, true);}
817 if(TIME_HIST_FILE!=null){rs.writeTimeToFile(TIME_HIST_FILE);}
818
819 boolean b=rs.errorState;
820 clear();
821 return b;
822 }
823 clear();
824 return false;
825 }
826
827 public void writeAverageQualityToFile(String fname, boolean writePaired){
828 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
829 tsw.start();
830 tsw.print("#Quality\tcount1\tfraction1"+(writePaired ? "\tcount2\tfraction2" : "")+"\n");
831
832 long sum1=Tools.sum(aqualArray[0]);
833 long sum2=Tools.sum(aqualArray[1]);
834 double mult1=1.0/Tools.max(1, sum1);
835 double mult2=1.0/Tools.max(1, sum2);
836
837 long y=sum1+sum2;
838 for(int i=0; i<aqualArray[0].length; i++){
839 long x1=aqualArray[0][i];
840 long x2=aqualArray[1][i];
841 y-=x1;
842 y-=x2;
843 tsw.print(String.format(Locale.ROOT, "%d\t%d\t%.5f", i, x1, x1*mult1));
844 if(writePaired){
845 tsw.print(String.format(Locale.ROOT, "\t%d\t%.5f", x2, x2*mult2));
846 }
847 tsw.print("\n");
848 if(y<=0){break;}
849 }
850 tsw.poisonAndWait();
851 errorState|=tsw.errorState;
852 }
853
854 public void writeQCountToFile(String fname, boolean writePaired){
855 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
856 tsw.start();
857 tsw.print("#Quality\tcount1\tfraction1"+(writePaired ? "\tcount2\tfraction2" : "")+"\n");
858
859 long sum1=Tools.sum(qcountHist[0]);
860 long sum2=Tools.sum(qcountHist[1]);
861 double mult1=1.0/Tools.max(1, sum1);
862 double mult2=1.0/Tools.max(1, sum2);
863
864 long y=sum1+sum2;
865 for(int i=0; i<qcountHist[0].length; i++){
866 long x1=qcountHist[0][i];
867 long x2=qcountHist[1][i];
868 y-=x1;
869 y-=x2;
870 tsw.print(String.format(Locale.ROOT, "%d\t%d\t%.5f", i, x1, x1*mult1));
871 if(writePaired){
872 tsw.print(String.format(Locale.ROOT, "\t%d\t%.5f", x2, x2*mult2));
873 }
874 tsw.print("\n");
875 if(y<=0){break;}
876 }
877 tsw.poisonAndWait();
878 errorState|=tsw.errorState;
879 }
880
881 public void writeQualityToFile(String fname, boolean writePaired){
882 StringBuilder sb=new StringBuilder();
883 final boolean measure=(matchSum!=null);
884 if(measure){
885 if(writePaired){
886 sb.append("#BaseNum\tRead1_linear\tRead1_log\tRead1_measured\tRead2_linear\tRead2_log\tRead2_measured\n");
887 }else{
888 sb.append("#BaseNum\tRead1_linear\tRead1_log\tRead1_measured\n");
889 }
890 }else{
891 sb.append("#BaseNum\tRead1_linear\tRead1_log"+(writePaired ? "\tRead2_linear\tRead2_log" : "")+"\n");
892 }
893
894 final long[] qs1=qualSum[0], qs2=qualSum[1], ql1=qualLength[0], ql2=qualLength[1];
895 final double[] qsd1=qualSumDouble[0], qsd2=qualSumDouble[1];
896
897 for(int i=MAXLEN-2; i>=0; i--){
898 ql1[i]+=ql1[i+1];
899 ql2[i]+=ql2[i+1];
900 }
901
902 double div1sum=0;
903 double div2sum=0;
904 double deviation1sum=0;
905 double deviation2sum=0;
906
907 if(writePaired){
908 for(int i=0; i<MAXLEN && (ql1[i]>0 || ql2[i]>0); i++){
909 final int a=i+1;
910 double blin, clin, blog, clog;
911 final double div1=(double)Tools.max(1, ql1[i]);
912 final double div2=(double)Tools.max(1, ql2[i]);
913
914 blin=qs1[i]/div1;
915 clin=qs2[i]/div2;
916 blog=qsd1[i]/div1;
917 clog=qsd2[i]/div2;
918 blog=QualityTools.probErrorToPhredDouble(blog);
919 clog=QualityTools.probErrorToPhredDouble(clog);
920 if(measure){
921 double bcalc=calcQualityAtPosition(i, 0);
922 double ccalc=calcQualityAtPosition(i, 1);
923
924 div1sum+=div1;
925 div2sum+=div2;
926 deviation1sum+=Math.abs(blog-bcalc)*div1;
927 deviation2sum+=Math.abs(clog-ccalc)*div2;
928
929 sb.append(String.format(Locale.ROOT, "%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", a, blin, blog, bcalc, clin, clog, ccalc));
930 }else{
931 sb.append(String.format(Locale.ROOT, "%d\t%.3f\t%.3f\t%.3f\t%.3f\n", a, blin, blog, clin, clog));
932 }
933 }
934 }else{
935 for(int i=0; i<MAXLEN && ql1[i]>0; i++){
936 final int a=i+1;
937 double blin, blog;
938 final double div1=(double)Tools.max(1, ql1[i]);
939
940 blin=qs1[i]/div1;
941 blog=qsd1[i]/div1;
942 blog=QualityTools.probErrorToPhredDouble(blog);
943 if(measure){
944 double bcalc=calcQualityAtPosition(i, 0);
945
946 div1sum+=div1;
947 deviation1sum+=Math.abs(blog-bcalc)*div1;
948
949 sb.append(String.format(Locale.ROOT, "%d\t%.3f\t%.3f\t%.3f\n", a, blin, blog, bcalc));
950 }else{
951 sb.append(String.format(Locale.ROOT, "%d\t%.3f\t%.3f\n", a, blin, blog));
952 }
953 }
954 }
955
956 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
957 tsw.start();
958 if(measure){
959 if(writePaired){
960 tsw.print(String.format(Locale.ROOT, "#Deviation\t%.4f\t%.4f\n", deviation1sum/div1sum, deviation2sum/div2sum));
961 }else{
962 tsw.print(String.format(Locale.ROOT, "#Deviation\t%.4f\n", deviation1sum/div1sum));
963 }
964 }
965 tsw.print(sb);
966 tsw.poisonAndWait();
967 errorState|=tsw.errorState;
968 }
969
970 private double calcQualityAtPosition(int pos, int pairnum){
971 boolean includeNs=true;
972 long m=matchSum[pairnum][pos];
973 long d=delSum[pairnum][pos]; //left-adjacent deletion
974 long d2=delSum[pairnum][Tools.min(pos, delSum[pairnum].length-1)]; //right-adjacent deletion
975 long i=insSum[pairnum][pos];
976 long s=subSum[pairnum][pos];
977 long n=(includeNs ? nSum[pairnum][pos] : 0); //This only tracks no-calls, not no-refs.
978 long good=Tools.max(0, m*2-d-d2+n/2);
979 long total=Tools.max(0, m*2+i*2+s*2+n*2); //not d
980 long bad=total-good;
981 if(total<1){return 0;}
982 double error=bad/(double)total;
983 return QualityTools.probErrorToPhredDouble(error);
984 }
985
986 public void writeBQualityOverallToFile(String fname){
987 final long[] cp30=Arrays.copyOf(bqualHistOverall, bqualHistOverall.length);
988 for(int i=0; i<30; i++){cp30[i]=0;}
989
990 final long sum=Tools.sum(bqualHistOverall);
991 final long median=Tools.percentileHistogram(bqualHistOverall, 0.5);
992 final double mean=Tools.averageHistogram(bqualHistOverall);
993 final double stdev=Tools.standardDeviationHistogram(bqualHistOverall);
994 final double mean30=Tools.averageHistogram(cp30);
995 final double stdev30=Tools.standardDeviationHistogram(cp30);
996 final double mult=1.0/Tools.max(1, sum);
997 long y=sum;
998
999 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
1000 tsw.start();
1001 tsw.print("#Median\t"+median+"\n");
1002 tsw.print("#Mean\t"+String.format(Locale.ROOT, "%.3f", mean)+"\n");
1003 tsw.print("#STDev\t"+String.format(Locale.ROOT, "%.3f", stdev)+"\n");
1004 tsw.print("#Mean_30\t"+String.format(Locale.ROOT, "%.3f", mean30)+"\n");
1005 tsw.print("#STDev_30\t"+String.format(Locale.ROOT, "%.3f", stdev30)+"\n");
1006 tsw.print("#Quality\tbases\tfraction\n");
1007
1008 for(int i=0; i<bqualHistOverall.length; i++){
1009 long x=bqualHistOverall[i];
1010 y-=x;
1011 tsw.print(String.format(Locale.ROOT, "%d\t%d\t%.5f", i, x, x*mult));
1012 tsw.print("\n");
1013 if(y<=0){break;}
1014 }
1015 tsw.poisonAndWait();
1016 errorState|=tsw.errorState;
1017 }
1018
1019 public void writeBQualityToFile(String fname, boolean writePaired){
1020 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
1021 tsw.start();
1022 tsw.print("#BaseNum\tcount_1\tmin_1\tmax_1\tmean_1\tQ1_1\tmed_1\tQ3_1\tLW_1\tRW_1");
1023 if(writePaired){tsw.print("\tcount_2\tmin_2\tmax_2\tmean_2\tQ1_2\tmed_2\tQ3_2\tLW_2\tRW_2");}
1024 tsw.print("\n");
1025
1026 for(int i=0; i<MAXLEN; i++){
1027 final long[] a1=bqualHist[0][i], a2=bqualHist[1][i];
1028 final long sum1=Tools.sum(a1), sum2=Tools.sum(a2);
1029 if(sum1<1 && sum2<1){break;}
1030
1031 {
1032 final long a[]=a1, sum=sum1;
1033
1034 final long weightedSum=Tools.sumHistogram(a);
1035 final long med=Tools.medianHistogram(a), min=Tools.minHistogram(a), max=Tools.maxHistogram(a);
1036 final long firstQuart=Tools.percentileHistogram(a, 0.25);
1037 final long thirdQuart=Tools.percentileHistogram(a, 0.75);
1038 final long leftWhisker=Tools.percentileHistogram(a, 0.02);
1039 final long rightWhisker=Tools.percentileHistogram(a, 0.98);
1040 final double mean=weightedSum*1.0/Tools.max(sum, 0);
1041 tsw.print(String.format(Locale.ROOT, "%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\t%d", i, sum, min, max, mean, firstQuart, med, thirdQuart, leftWhisker, rightWhisker));
1042 }
1043
1044 if(writePaired){
1045 final long a[]=a2, sum=sum2;
1046
1047 final long weightedSum=Tools.sumHistogram(a);
1048 final long med=Tools.medianHistogram(a), min=Tools.minHistogram(a), max=Tools.maxHistogram(a);
1049 final long firstQuart=Tools.percentileHistogram(a, 0.25);
1050 final long thirdQuart=Tools.percentileHistogram(a, 0.75);
1051 final long leftWhisker=Tools.percentileHistogram(a, 0.02);
1052 final long rightWhisker=Tools.percentileHistogram(a, 0.98);
1053 final double mean=weightedSum*1.0/Tools.max(sum, 0);
1054 tsw.print(String.format(Locale.ROOT, "\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\t%d", sum, min, max, mean, firstQuart, med, thirdQuart, leftWhisker, rightWhisker));
1055 }
1056 tsw.print("\n");
1057 }
1058 tsw.poisonAndWait();
1059 errorState|=tsw.errorState;
1060 }
1061
1062 public void writeQualityAccuracyToFile(String fname){
1063
1064 int max=qualMatch.length;
1065 for(int i=max-1; i>=0; i--){
1066 if(qualMatch[i]+qualSub[i]+qualIns[i]+qualDel[i]>0){break;}
1067 max=i;
1068 }
1069
1070 final int qMin=Read.MIN_CALLED_QUALITY(), qMax=Read.MAX_CALLED_QUALITY();
1071
1072 double devsum=0;
1073 double devsumSub=0;
1074 long observations=0;
1075 for(int i=0; i<max; i++){
1076 long qm=qualMatch[i]*2;
1077 long qs=qualSub[i]*2;
1078 long qi=qualIns[i]*2;
1079 long qd=qualDel[i];
1080
1081 double phred=-1;
1082 double phredSub=-1;
1083
1084 long sum=qm+qs+qi+qd;
1085 if(sum>0){
1086 double mult=1.0/sum;
1087 double subRate=(qs)*mult;
1088 double errorRate=(qs+qi+qd)*mult;
1089
1090 phredSub=QualityTools.probErrorToPhredDouble(subRate);
1091 phred=QualityTools.probErrorToPhredDouble(errorRate);
1092 double deviation=phred-i;
1093 double deviationSub=phredSub-i;
1094 if(i==qMin && deviation<0){deviation=0;}
1095 else if(i==qMax && max==qMax+1 && deviation>0){deviation=0;}
1096 if(i==qMin && deviationSub<0){deviationSub=0;}
1097 else if(i==qMax && max==qMax+1 && deviationSub>0){deviationSub=0;}
1098 devsum+=(Math.abs(deviation)*sum);
1099 devsumSub+=(Math.abs(deviationSub)*sum);
1100 observations+=sum;
1101 }
1102 }
1103
1104 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
1105 tsw.start();
1106 tsw.print(String.format(Locale.ROOT, "#Deviation\t%.3f\n", devsum/observations));
1107 tsw.print(String.format(Locale.ROOT, "#DeviationSub\t%.3f\n", devsumSub/observations));
1108 tsw.print("#Quality\tMatch\tSub\tIns\tDel\tTrueQuality\tTrueQualitySub\n");
1109 for(int i=0; i<max; i++){
1110 long qm=qualMatch[i]*2;
1111 long qs=qualSub[i]*2;
1112 long qi=qualIns[i]*2;
1113 long qd=qualDel[i];
1114
1115 double phred=-1;
1116 double phredSub=-1;
1117
1118 long sum=qm+qs+qi+qd;
1119 if(sum>0){
1120 double mult=1.0/sum;
1121 double subRate=(qs)*mult;
1122 double errorRate=(qs+qi+qd)*mult;
1123
1124 phredSub=QualityTools.probErrorToPhredDouble(subRate);
1125 phred=QualityTools.probErrorToPhredDouble(errorRate);
1126
1127 // System.err.println("sub: "+qs+"/"+sum+" -> "+subRate+" -> "+phredSub);
1128 }
1129
1130 tsw.print(i+"\t"+qm+"\t"+qs+"\t"+qi+"\t"+qd);
1131 tsw.print(phred>=0 ? String.format(Locale.ROOT, "\t%.2f", phred) : "\t");
1132 tsw.print(phredSub>=0 ? String.format(Locale.ROOT, "\t%.2f\n", phredSub) : "\t\n");
1133
1134 // System.err.println(qm+"\t"+qs+"\t"+qi+"\t"+qd);
1135 }
1136
1137 tsw.poisonAndWait();
1138 errorState|=tsw.errorState;
1139 }
1140
1141 public void writeMatchToFile(String fname, boolean writePaired){
1142 if(!writePaired){
1143 writeMatchToFileUnpaired(fname);
1144 return;
1145 }
1146 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1147 tsw.start();
1148 tsw.print("#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\tMatch2\tSub2\tDel2\tIns2\tN2\tOther2\n");
1149
1150 final long[] ms1=matchSum[0], ds1=delSum[0], is1=insSum[0],
1151 ss1=subSum[0], ns1=nSum[0], cs1=clipSum[0], os1=otherSum[0];
1152 final long[] ms2=matchSum[1], ds2=delSum[1], is2=insSum[1],
1153 ss2=subSum[1], ns2=nSum[1], cs2=clipSum[1], os2=otherSum[1];
1154
1155 for(int i=0; i<MAXLEN; i++){
1156 int a=i+1;
1157 long sum1=ms1[i]+is1[i]+ss1[i]+ns1[i]+cs1[i]+os1[i]; //no deletions
1158 long sum2=ms2[i]+is2[i]+ss2[i]+ns2[i]+cs2[i]+os2[i]; //no deletions
1159 if(sum1==0 && sum2==0){break;}
1160 double inv1=1.0/(double)Tools.max(1, sum1);
1161 double inv2=1.0/(double)Tools.max(1, sum2);
1162
1163 tsw.print(String.format(Locale.ROOT, "%d", a));
1164 tsw.print(String.format(Locale.ROOT, "\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f",
1165 ms1[i]*inv1, ss1[i]*inv1, ds1[i]*inv1, is1[i]*inv1, ns1[i]*inv1, (os1[i]+cs1[i])*inv1));
1166 tsw.print(String.format(Locale.ROOT, "\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f",
1167 ms2[i]*inv2, ss2[i]*inv2, ds2[i]*inv2, is2[i]*inv2, ns2[i]*inv2, (os2[i]+cs2[i])*inv2)
1168 // +", "+ms2[i]+", "+is2[i]+", "+ss2[i]+", "+ns2[i]+", "+cs2[i]+", "+os2[i]
1169 );
1170 tsw.print("\n");
1171 }
1172 tsw.poisonAndWait();
1173 errorState|=tsw.errorState;
1174 }
1175
1176 public void writeMatchToFileUnpaired(String fname){
1177 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1178 tsw.start();
1179 tsw.print("#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\n");
1180
1181 final long[] ms1=matchSum[0], ds1=delSum[0], is1=insSum[0],
1182 ss1=subSum[0], ns1=nSum[0], cs1=clipSum[0], os1=otherSum[0];
1183
1184 for(int i=0; i<MAXLEN; i++){
1185 int a=i+1;
1186 long sum1=ms1[i]+is1[i]+ss1[i]+ns1[i]+cs1[i]+os1[i]; //no deletions
1187 if(sum1==0){break;}
1188 double inv1=1.0/(double)Tools.max(1, sum1);
1189
1190 tsw.print(String.format(Locale.ROOT, "%d", a));
1191 tsw.print(String.format(Locale.ROOT, "\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f",
1192 ms1[i]*inv1, ss1[i]*inv1, ds1[i]*inv1, is1[i]*inv1, ns1[i]*inv1, (os1[i]+cs1[i])*inv1)
1193 // +", "+ms1[i]+", "+is1[i]+", "+ss1[i]+", "+ns1[i]+", "+cs1[i]+", "+os1[i]
1194 );
1195 tsw.print("\n");
1196 }
1197 tsw.poisonAndWait();
1198 errorState|=tsw.errorState;
1199 }
1200
1201 public void writeInsertToFile(String fname){
1202 StringBuilder sb=new StringBuilder();
1203 sb.append("#Mean\t"+String.format(Locale.ROOT, "%.3f", Tools.averageHistogram(insertHist.array))+"\n");
1204 sb.append("#Median\t"+Tools.percentileHistogram(insertHist.array, 0.5)+"\n");
1205 sb.append("#Mode\t"+Tools.calcModeHistogram(insertHist.array)+"\n");
1206 sb.append("#STDev\t"+String.format(Locale.ROOT, "%.3f", Tools.standardDeviationHistogram(insertHist.array))+"\n");
1207 double percent=pairedCount*100.0/(pairedCount+unpairedCount);
1208 sb.append("#PercentOfPairs\t"+String.format(Locale.ROOT, "%.3f", percent)+"\n");
1209 // sb.append("#PercentOfPairs\t"+String.format(Locale.ROOT, "%.3f", matedPercent)+"\n");
1210 sb.append("#InsertSize\tCount\n");
1211 writeHistogramToFile(fname, sb.toString(), insertHist, !skipZeroInsertCount);
1212 }
1213
1214 public void writeBaseContentToFile(String fname, boolean paired){
1215 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
1216 bsw.start();
1217 bsw.print("#Pos\tA\tC\tG\tT\tN\n");
1218
1219 int max=writeBaseContentToFile2(bsw, baseHist[0], 0);
1220 if(paired){
1221 writeBaseContentToFile2(bsw, baseHist[1], max);
1222 }
1223
1224 bsw.poisonAndWait();
1225 errorState|=bsw.errorState;
1226 }
1227
1228 private static int writeBaseContentToFile2(ByteStreamWriter bsw, LongList[] lists, int offset){
1229 int max=0;
1230 ByteBuilder sb=new ByteBuilder(100);
1231 for(LongList ll : lists){max=Tools.max(max, ll.size);}
1232 for(int i=0; i<max; i++){
1233 long a=lists[1].get(i);
1234 long c=lists[2].get(i);
1235 long g=lists[3].get(i);
1236 long t=lists[4].get(i);
1237 long n=lists[0].get(i);
1238 double mult=1.0/(a+c+g+t+n);
1239
1240 sb.setLength(0);
1241 sb.append(i+offset).tab();
1242 sb.append(a*mult, 5).tab();
1243 sb.append(c*mult, 5).tab();
1244 sb.append(g*mult, 5).tab();
1245 sb.append(t*mult, 5).tab();
1246 sb.append(n*mult, 5);
1247 sb.nl();
1248
1249 bsw.print(sb);
1250 }
1251 return max;
1252 }
1253
1254 public void writeIndelToFile(String fname){
1255 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
1256 bsw.start();
1257 bsw.print("#Length\tDeletions\tInsertions\n");
1258
1259 int max=Tools.max(insHist.size, delHist.size);
1260
1261 ByteBuilder bb=new ByteBuilder(100);
1262 for(int i=0; i<max; i++){
1263 long x=delHist.get(i);
1264 long y=insHist.get(i);
1265 if(x>0 || y>0 || !skipZeroIndel){
1266 bb.clear();
1267 bb.append(i).tab().append(x).tab().append(y).nl();
1268 bsw.print(bb);
1269 }
1270 }
1271
1272 //TODO: Disabled because it was irritating when graphing. Should write to a different file.
1273 // tsw.print("#Length_bin\tDeletions\n");
1274 // max=delHist2.size;
1275 // for(int i=0; i<max; i++){
1276 // long x=delHist2.get(i);
1277 // if(x>0 || !skipZeroIndel){
1278 // tsw.print((i*DEL_BIN)+"\t"+x+"\n");
1279 // }
1280 // }
1281
1282 bsw.poisonAndWait();
1283 errorState|=bsw.errorState;
1284 }
1285
1286 public void writeErrorToFile(String fname){
1287 writeHistogramToFile(fname, "#Errors\tCount\n", errorHist, false);
1288 }
1289
1290 public void writeLengthToFile(String fname){
1291 writeHistogramToFile(fname, "#Length\tCount\n", lengthHist, false);
1292 }
1293
1294 public void writeTimeToFile(String fname){
1295 writeHistogramToFile(fname, "#Time\tCount\n", timeHist, false);
1296 }
1297
1298 public void writeHistogramToFile(String fname, String header, LongList hist, boolean printZeros){
1299 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
1300 bsw.start();
1301 bsw.print(header);
1302
1303 int max=hist.size;
1304
1305 ByteBuilder bb=new ByteBuilder(40);
1306 for(int i=0; i<max; i++){
1307 long x=hist.get(i);
1308 if(x>0 || printZeros){
1309 bb.clear().append(i).tab().append(x).nl();
1310 bsw.print(bb);
1311 }
1312 }
1313 bsw.poisonAndWait();
1314 errorState|=bsw.errorState;
1315 }
1316
1317 public void writeHistogramToFile(String fname, String header, SuperLongList hist, boolean printZeros){
1318 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
1319 bsw.start();
1320 bsw.print(header);
1321
1322 hist.sort();
1323 long max=hist.max();
1324 long[] array=hist.array();
1325 LongList list=hist.list();
1326
1327 ByteBuilder bb=new ByteBuilder(40);
1328 for(int i=0; i<array.length; i++){
1329 long x=array[i];
1330 if(x>0 || printZeros){
1331 bb.clear().append(i).tab().append(x).nl();
1332 bsw.print(bb);
1333 }
1334 }
1335 {//TODO: This part does not bother printing zeros regardless of the flag.
1336 long prev=-1;
1337 int streak=0;
1338 for(int i=0; i<list.size(); i++){
1339 long x=list.get(i);
1340 if(x==prev){streak++;}
1341 else{
1342 if(streak>0){//Print previous key
1343 bb.clear().append(prev).tab().append(streak).nl();
1344 bsw.print(bb);
1345 }
1346 streak=1;
1347 prev=x;
1348 }
1349 }
1350 if(streak>0){//Print final key
1351 bb.clear().append(prev).tab().append(streak).nl();
1352 bsw.print(bb);
1353 }
1354 }
1355 bsw.poisonAndWait();
1356 errorState|=bsw.errorState;
1357 }
1358
1359 public void writeGCToFile(String fname, boolean printZeros){
1360 final long[] hist;
1361 if(GC_BINS_AUTO && gcMaxReadLen+1<gcHist.length){
1362 hist=Tools.downsample(gcHist, gcMaxReadLen+1);
1363 }else{
1364 hist=gcHist;
1365 }
1366 final int bins=hist.length;
1367 final double gcMult=100.0/Tools.max(1, bins-1);
1368 final long total=Tools.sum(hist);
1369 final long max=Tools.max(hist);
1370 final double countsPerX=Tools.max(1, ((max*1000.0)/40));
1371 final double fractionMult=1.0/Tools.max(1, total);
1372 long sum=0;
1373
1374 GCMean=Tools.averageHistogram(hist)*gcMult;
1375 GCMedian=Tools.percentileHistogram(hist, 0.5)*gcMult;
1376 GCMode=Tools.calcModeHistogram(hist)*gcMult;
1377 GCSTDev=Tools.standardDeviationHistogram(hist)*gcMult;
1378
1379 ByteBuilder bb=new ByteBuilder(256);
1380 bb.append("#Mean\t").append(GCMean, 3).nl();
1381 bb.append("#Median\t").append(GCMedian, 3).nl();
1382 bb.append("#Mode\t").append(GCMode, 3).nl();
1383 bb.append("#STDev\t").append(GCSTDev, 3).nl();
1384 if(GC_PLOT_X){
1385 bb.append("#GC\tCount\tCumulative\tPlot\n");
1386 }else{
1387 bb.append("#GC\tCount\n");
1388 }
1389
1390
1391 // bsw.print("#Mean\t"+String.format(Locale.ROOT, "%.3f", GCMean)+"\n");
1392 // bsw.print("#Median\t"+String.format(Locale.ROOT, "%.3f", GCMedian)+"\n");
1393 // bsw.print("#Mode\t"+String.format(Locale.ROOT, "%.3f", GCMode)+"\n");
1394 // bsw.print("#STDev\t"+String.format(Locale.ROOT, "%.3f", GCSTDev)+"\n");
1395 // if(GC_PLOT_X){
1396 // bsw.print("#GC\tCount\tCumulative\tPlot\n");
1397 // }else{
1398 // bsw.print("#GC\tCount\n");
1399 // }
1400
1401 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
1402 bsw.start();
1403 bsw.print(bb);
1404
1405 for(int i=0; i<bins; i++){
1406 long x=hist[i];
1407 sum+=x;
1408 if(x>0 || printZeros){
1409 //This assumes GC_BINS==100
1410 // tsw.print(i+"\t"+x+"\n");
1411 if(GC_PLOT_X){
1412 bb.clear();
1413 bb.append(i*gcMult, 1).tab().append(x).tab();
1414 bb.append(sum*fractionMult, 3).tab();
1415
1416 int len=(int)((x*1000)/countsPerX);
1417 for(int j=0; j<len; j++){bb.append('X');}
1418 if(len<1 && x>0){
1419 if((x*1000f)/countsPerX>0.1f){bb.append('x');}
1420 else{bb.append('.');}
1421 }
1422
1423 bb.append('\n');
1424 bsw.print(bb);
1425 }else{
1426 bb.clear().append(i*gcMult, 1).tab().append(x).nl();
1427 bsw.print(bb);
1428 }
1429 }
1430 }
1431 bsw.poisonAndWait();
1432 errorState|=bsw.errorState;
1433 }
1434
1435 public void writeEntropyToFile(String fname, boolean printZeros){
1436 final long[] hist=entropyHist;
1437
1438 final int bins=hist.length;
1439 final double mult=1.0/Tools.max(1, bins-1);
1440 final long total=Tools.sum(hist);
1441 final long max=Tools.max(hist);
1442 final double countsPerX=Tools.max(1, ((max*1000.0)/40));
1443 final double fractionMult=1.0/Tools.max(1, total);
1444 long sum=0;
1445
1446 double mean=Tools.averageHistogram(hist)*mult;
1447 double median=Tools.percentileHistogram(hist, 0.5)*mult;
1448 double mode=Tools.calcModeHistogram(hist)*mult;
1449 double stdev=Tools.standardDeviationHistogram(hist)*mult;
1450
1451 ByteBuilder bb=new ByteBuilder(256);
1452 bb.append("#Mean\t").append(mean, 6).nl();
1453 bb.append("#Median\t").append(median, 6).nl();
1454 bb.append("#Mode\t").append(mode, 6).nl();
1455 bb.append("#STDev\t").append(stdev, 6).nl();
1456 bb.append("#Value\tCount\n");
1457
1458 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, false);
1459 bsw.start();
1460 bsw.print(bb);
1461
1462 for(int i=0; i<bins; i++){
1463 long x=hist[i];
1464 sum+=x;
1465 if(x>0 || printZeros){
1466 bb.clear().append(i*mult, 4).tab().append(x).nl();
1467 bsw.print(bb);
1468 }
1469 }
1470 bsw.poisonAndWait();
1471 errorState|=bsw.errorState;
1472 }
1473
1474 public void writeIdentityToFile(String fname, boolean printZeros){
1475 final long[] hist, histb;
1476 if(ID_BINS_AUTO && idMaxReadLen+1<idHist.length){
1477 hist=Tools.downsample(idHist, idMaxReadLen+1);
1478 histb=Tools.downsample(idBaseHist, idMaxReadLen+1);
1479 }else{
1480 hist=idHist;
1481 histb=idBaseHist;
1482 }
1483 final int max=hist.length;
1484 final double mult=100.0/(max-1);
1485
1486 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1487 tsw.start();
1488
1489
1490 tsw.print("#Mean_reads\t"+String.format(Locale.ROOT, "%.3f", (Tools.averageHistogram(hist)*mult))+"\n");
1491 tsw.print("#Mean_bases\t"+(String.format(Locale.ROOT, "%.3f", Tools.averageHistogram(histb)*mult))+"\n");
1492 tsw.print("#Median_reads\t"+(int)Math.round(Tools.percentileHistogram(hist, 0.5)*mult)+"\n");
1493 tsw.print("#Median_bases\t"+(int)Math.round(Tools.percentileHistogram(histb, 0.5)*mult)+"\n");
1494 tsw.print("#Mode_reads\t"+(int)Math.round(Tools.calcModeHistogram(hist)*mult)+"\n");
1495 tsw.print("#Mode_bases\t"+(int)Math.round(Tools.calcModeHistogram(histb)*mult)+"\n");
1496 tsw.print("#STDev_reads\t"+String.format(Locale.ROOT, "%.3f", (Tools.standardDeviationHistogram(hist)*mult))+"\n");
1497 tsw.print("#STDev_bases\t"+String.format(Locale.ROOT, "%.3f", (Tools.standardDeviationHistogram(histb)*mult))+"\n");
1498 tsw.print("#Identity\tReads\tBases\n");
1499
1500 for(int i=0; i<max; i++){
1501 long x=hist[i], x2=histb[i];
1502 if(x>0 || printZeros){
1503 tsw.print(String.format(Locale.ROOT, "%.1f", i*mult)+"\t"+x+"\t"+x2+"\n");
1504 }
1505 }
1506 tsw.poisonAndWait();
1507 errorState|=tsw.errorState;
1508 }
1509
1510 //Tracks to see if read2s have been encountered, for displaying stats.
1511 private long read2Count=0;
1512
1513 public long pairedCount=0;
1514 public long unpairedCount=0;
1515
1516 public final long[][] aqualArray;
1517 public final long[][] qualLength;
1518 public final long[][] qualSum;
1519
1520 public final long[][][] bqualHist;
1521 public final long[] bqualHistOverall;
1522
1523 public final long[][] qcountHist;
1524
1525 public final double[][] qualSumDouble;
1526
1527 public final long[][] matchSum;
1528 public final long[][] delSum;
1529 public final long[][] insSum;
1530 public final long[][] subSum;
1531 public final long[][] nSum;
1532 public final long[][] clipSum;
1533 public final long[][] otherSum;
1534
1535 public final long[] qualMatch;
1536 public final long[] qualSub;
1537 public final long[] qualIns;
1538 public final long[] qualDel;
1539
1540 public final long[] gcHist;
1541 public final long[] entropyHist;
1542 public final EntropyTracker eTracker;
1543 public final long[] idHist;
1544 public final long[] idBaseHist;
1545 private int gcMaxReadLen=1;
1546 private int idMaxReadLen=1;
1547
1548 public final LongList[][] baseHist;
1549
1550 /** Insert size */
1551 public final LongList insertHist;
1552 /** Read length */
1553 public final SuperLongList lengthHist;
1554 /** Number errors per read */
1555 public final LongList errorHist;
1556 /** Insertion length */
1557 public final LongList insHist;
1558 /** Deletion length */
1559 public final LongList delHist;
1560 /** Deletion length, binned */
1561 public final LongList delHist2;
1562 /** Time */
1563 public final LongList timeHist;
1564
1565 public static boolean REQUIRE_PROPER_PAIR=true;
1566 public static int MAXLEN=6000;
1567 public static int MAXINSERTLEN=80000;
1568 @Deprecated
1569 public static int MAXLENGTHLEN=80000;
1570 public static final int MAXTIMELEN=80000;
1571 public static final int MAXINSLEN=1000;
1572 public static final int MAXDELLEN=1000;
1573 public static final int MAXDELLEN2=1000000;
1574 public static final int DEL_BIN=100;
1575 public static int ID_BINS=100;
1576 public static boolean ID_BINS_AUTO=false;
1577 public static int GC_BINS=100;
1578 public static boolean GC_BINS_AUTO=false;
1579 public static boolean GC_PLOT_X=false;
1580 public static int ENTROPY_BINS=1000;
1581
1582 public static double GCMean;
1583 public static double GCMedian;
1584 public static double GCMode;
1585 public static double GCSTDev;
1586
1587 // public static double entropyMean;
1588 // public static double entropyMedian;
1589 // public static double entropyMode;
1590 // public static double entropySTDev;
1591
1592 public boolean errorState=false;
1593
1594 public static ReadStats merged=null;
1595
1596 // public static double matedPercent=0;
1597
1598 public static void clear(){
1599 COLLECT_QUALITY_STATS=false;
1600 COLLECT_QUALITY_ACCURACY=false;
1601 COLLECT_MATCH_STATS=false;
1602 COLLECT_INSERT_STATS=false;
1603 COLLECT_BASE_STATS=false;
1604 COLLECT_INDEL_STATS=false;
1605 COLLECT_GC_STATS=false;
1606 COLLECT_ENTROPY_STATS=false;
1607 COLLECT_ERROR_STATS=false;
1608 COLLECT_LENGTH_STATS=false;
1609 COLLECT_IDENTITY_STATS=false;
1610 COLLECT_TIME_STATS=false;
1611
1612 AVG_QUAL_HIST_FILE=null;
1613 QUAL_HIST_FILE=null;
1614 BQUAL_HIST_FILE=null;
1615 QUAL_COUNT_HIST_FILE=null;
1616 BQUAL_HIST_OVERALL_FILE=null;
1617 QUAL_ACCURACY_FILE=null;
1618 MATCH_HIST_FILE=null;
1619 INSERT_HIST_FILE=null;
1620 BASE_HIST_FILE=null;
1621 INDEL_HIST_FILE=null;
1622 ERROR_HIST_FILE=null;
1623 LENGTH_HIST_FILE=null;
1624 GC_HIST_FILE=null;
1625 ENTROPY_HIST_FILE=null;
1626 IDENTITY_HIST_FILE=null;
1627 TIME_HIST_FILE=null;
1628 }
1629
1630 public static ArrayList<ReadStats> objectList=new ArrayList<ReadStats>();
1631 public static boolean COLLECT_QUALITY_STATS=false;
1632 public static boolean COLLECT_QUALITY_ACCURACY=false;
1633 public static boolean COLLECT_MATCH_STATS=false;
1634 public static boolean COLLECT_INSERT_STATS=false;
1635 public static boolean COLLECT_BASE_STATS=false;
1636 public static boolean COLLECT_INDEL_STATS=false;
1637 public static boolean COLLECT_GC_STATS=false;
1638 public static boolean COLLECT_ENTROPY_STATS=false;
1639 public static boolean COLLECT_ERROR_STATS=false;
1640 public static boolean COLLECT_LENGTH_STATS=false;
1641 public static boolean COLLECT_IDENTITY_STATS=false;
1642 public static boolean COLLECT_TIME_STATS=false;
1643
1644 public static boolean collectingStats(){
1645 return COLLECT_QUALITY_STATS || COLLECT_QUALITY_ACCURACY || COLLECT_MATCH_STATS || COLLECT_INSERT_STATS
1646 || COLLECT_BASE_STATS || COLLECT_INDEL_STATS || COLLECT_GC_STATS || COLLECT_ENTROPY_STATS
1647 || COLLECT_ERROR_STATS || COLLECT_LENGTH_STATS || COLLECT_IDENTITY_STATS || COLLECT_TIME_STATS;
1648 }
1649
1650 public static boolean usePairGC=true;
1651 public static boolean allowEntropyNs=true;
1652
1653 public static String AVG_QUAL_HIST_FILE=null;
1654 public static String QUAL_HIST_FILE=null;
1655 public static String BQUAL_HIST_FILE=null;
1656 public static String QUAL_COUNT_HIST_FILE=null;
1657 public static String BQUAL_HIST_OVERALL_FILE=null;
1658 public static String QUAL_ACCURACY_FILE=null;
1659 public static String MATCH_HIST_FILE=null;
1660 public static String INSERT_HIST_FILE=null;
1661 public static String BASE_HIST_FILE=null;
1662 public static String INDEL_HIST_FILE=null;
1663 public static String ERROR_HIST_FILE=null;
1664 public static String LENGTH_HIST_FILE=null;
1665 public static String GC_HIST_FILE=null;
1666 public static String ENTROPY_HIST_FILE=null;
1667 public static String IDENTITY_HIST_FILE=null;
1668 public static String TIME_HIST_FILE=null;
1669
1670 public static boolean overwrite=false;
1671 public static boolean append=false;
1672 public static final boolean verbose=false;
1673
1674 public static boolean skipZeroInsertCount=true;
1675 public static boolean skipZeroIndel=true;
1676
1677 }