comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package prok;
2
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.BitSet;
6 import java.util.HashMap;
7
8 import aligner.SingleStateAlignerFlat2;
9 import dna.AminoAcid;
10 import fileIO.FileFormat;
11 import gff.GffLine;
12 import shared.KillSwitch;
13 import shared.Shared;
14 import shared.Tools;
15 import stream.Read;
16 import stream.ReadInputStream;
17 import structures.ByteBuilder;
18 import structures.IntList;
19
20 /**
21 * This class is designed to store kmer frequencies related to gene
22 * starts, stops, and interiors. It can be loaded from a pgm file.
23 *
24 * It's possible to use multiple GeneModels; for example, one for
25 * each of several GC ranges or clades.
26 * @author Brian Bushnell
27 * @date Sep 24, 2018
28 *
29 */
30 public class GeneModel extends ProkObject {
31
32 /*--------------------------------------------------------------*/
33 /*---------------- Initialization ----------------*/
34 /*--------------------------------------------------------------*/
35
36 public GeneModel(boolean fill){
37 if(fill){
38 fillContainers();
39 }
40 }
41
42 void fillContainers(){
43 statsCDS.setInner(kInnerCDS, 3);
44 statsCDS.setStart(kStartCDS, startFrames, startLeftOffset);
45 statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset);
46
47 for(int i=0; i<rnaContainers.length; i++){
48 StatsContainer sc=rnaContainers[i];
49 sc.setInner(kInnerRNA, 1);
50 }
51
52 statstRNA.setStart(kStartRNA, 14, 4);
53 statstRNA.setStop(kStopRNA, 14, 6);
54
55 stats16S.setStart(kStartRNA, 20, 7);
56 stats16S.setStop(kStopRNA, 12, 16);
57
58 stats23S.setStart(kStartRNA, 17, 3);
59 stats23S.setStop(kStopRNA, 15, 12);
60
61 stats5S.setStart(kStartRNA, 20, 5);
62 stats5S.setStop(kStopRNA, 15, 5);
63
64 stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined
65 stats18S.setStop(kStopRNA, 12, 16);
66 }
67
68 /*--------------------------------------------------------------*/
69 /*---------------- Public Methods ----------------*/
70 /*--------------------------------------------------------------*/
71
72 public boolean process(String genomeFname, String gffFname){
73 // fnames.add(ReadWrite.stripPath(genomeFname));
74 numFiles++;
75 FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true);
76 FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true);
77
78 if(fnaFile==null || gffFile==null){
79 errorState=true;
80 return true;
81 }
82 filesProcessed++;
83
84 ArrayList<ScafData> scafList;
85 {//Scoped to save memory
86 ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads);
87 readsProcessed+=reads.size();
88 scafList=new ArrayList<ScafData>(reads.size());
89 for(Read r : reads){
90 basesProcessed+=r.length();
91 scafList.add(new ScafData(r));
92 }
93 }
94 {//Scoped to save memory
95 ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true);
96 ArrayList<GffLine> cds=allGffLines[0];
97 ArrayList<GffLine> rrna=allGffLines[1];
98 ArrayList<GffLine> trna=allGffLines[2];
99 genesProcessed+=cds.size();
100 genesProcessed+=(rrna==null ? 0 : rrna.size());
101 genesProcessed+=(trna==null ? 0 : trna.size());
102
103 HashMap<String, ScafData> scafMap=makeScafMap(scafList);
104 fillScafDataCDS(cds, scafMap);
105 fillScafDataRNA(rrna, scafMap);
106 fillScafDataRNA(trna, scafMap);
107 }
108
109 countBases(scafList);
110 if(PROCESS_PLUS_STRAND){
111 processStrand(scafList, Shared.PLUS);
112 }
113 if(PROCESS_MINUS_STRAND){
114 for(ScafData sd : scafList){
115 sd.clear();
116 sd.reverseComplement();
117 }
118 processStrand(scafList, Shared.MINUS);
119 for(ScafData sd : scafList){
120 sd.clear();
121 sd.reverseComplement();
122 }
123 }
124 return false;
125 }
126
127 public void add(GeneModel pgm){
128 for(int i=0; i<allContainers.length; i++){
129 // System.err.println("merging "+allContainers[i].name);
130 allContainers[i].add(pgm.allContainers[i]);
131 }
132
133 readsProcessed+=pgm.readsProcessed;
134 basesProcessed+=pgm.basesProcessed;
135 genesProcessed+=pgm.genesProcessed;
136 filesProcessed+=pgm.filesProcessed;
137
138 // geneStartsProcessed+=pgm.geneStartsProcessed;
139 // tRNAProcessed+=pgm.tRNAProcessed;
140 // r16SProcessed+=pgm.r16SProcessed;
141 // r23SProcessed+=pgm.r23SProcessed;
142 // r5SProcessed+=pgm.r5SProcessed;
143 // r18SProcessed+=pgm.r18SProcessed;
144
145 // fnames.addAll(pgm.fnames);
146 numFiles+=pgm.numFiles;
147 taxIds.addAll(pgm.taxIds);
148 Tools.add(baseCounts, pgm.baseCounts);
149 }
150
151 public void multiplyBy(double mult) {
152 for(int i=0; i<allContainers.length; i++){
153 allContainers[i].multiplyBy(mult);
154 }
155
156 readsProcessed=Math.round(readsProcessed*mult);
157 basesProcessed=Math.round(basesProcessed*mult);
158 genesProcessed=Math.round(genesProcessed*mult);
159 filesProcessed=Math.round(filesProcessed*mult);
160
161 for(int i=0; i<baseCounts.length; i++){
162 baseCounts[i]=Math.round(baseCounts[i]*mult);
163 }
164 }
165
166 /*--------------------------------------------------------------*/
167 /*---------------- Outer Methods ----------------*/
168 /*--------------------------------------------------------------*/
169
170 public float gc(){
171 long a=baseCounts[0];
172 long c=baseCounts[1];
173 long g=baseCounts[2];
174 long t=baseCounts[3];
175 return (float)((g+c)/Tools.max(1.0, a+t+g+c));
176 }
177
178 HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){
179 HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3);
180 for(ScafData sd : scafList){scafMap.put(sd.name, sd);}
181 for(ScafData sd : scafList){
182 String name=sd.name;
183 int idx=name.indexOf(' ');
184 if(idx>=0){
185 String prefix=name.substring(0, idx);
186 if(scafMap.containsKey(prefix)){
187 assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'";
188 }else{
189 scafMap.put(prefix, sd);
190 }
191 }
192 }
193 return scafMap;
194 }
195
196 public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){
197 if(!callCDS){return;}
198 for(GffLine gline : cdsLines){
199 ScafData sd=scafMap.get(gline.seqid);
200 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
201 sd.addCDS(gline);
202 }
203 }
204
205 public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){
206 for(GffLine gline : rnaLines){
207 ScafData sd=scafMap.get(gline.seqid);
208 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
209 if(processType(gline.prokType())){
210 sd.addRNA(gline);
211 }
212 }
213 }
214
215 public void processStrand(ArrayList<ScafData> scafList, int strand){
216 for(ScafData sd : scafList){
217 processCDS(sd, strand);
218 processRNA(sd, strand);
219 }
220 }
221
222 /*--------------------------------------------------------------*/
223 /*---------------- Inner Methods ----------------*/
224 /*--------------------------------------------------------------*/
225
226 private void countBases(ArrayList<ScafData> scafList){
227 for(ScafData sd : scafList){
228 countBases(sd.bases);
229 }
230 }
231
232 private void countBases(byte[] bases){
233 for(byte b : bases){
234 int x=AminoAcid.baseToNumberACGTother[b];
235 baseCounts[x]++;
236 }
237 }
238
239 /*--------------------------------------------------------------*/
240 /*---------------- Finding Codons ----------------*/
241 /*--------------------------------------------------------------*/
242
243 private static void findStopCodons(byte[] bases, IntList list, BitSet valid){
244 final int k=3;
245 final int mask=~((-1)<<(2*k));
246 int kmer=0;
247 int len=0;
248
249 for(int i=0; i<bases.length; i++){
250 byte b=bases[i];
251 int x=AminoAcid.baseToNumber[b];
252 kmer=((kmer<<2)|x)&mask;
253 if(x>=0){
254 len++;
255 if(len>=k){
256 int point=i;//End of the stop codon
257 if(isStopCodon(kmer) && !valid.get(point)){
258 list.add(point);
259 valid.set(point);
260 }
261 }
262 }else{len=0;}
263 }
264
265 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
266 if(!valid.get(i)){
267 list.add(i);
268 }
269 }
270 }
271
272 private static void findStartCodons(byte[] bases, IntList list, BitSet valid){
273 final int k=3;
274 final int mask=~((-1)<<(2*k));
275 int kmer=0;
276 int len=0;
277
278 for(int i=0; i<bases.length; i++){
279 byte b=bases[i];
280 int x=AminoAcid.baseToNumber[b];
281 kmer=((kmer<<2)|x)&mask;
282 if(x>=0){
283 len++;
284 if(len>=k){
285 int point=i-k+1;//Start of the start codon
286 if(isStartCodon(kmer) && !valid.get(point)){
287 list.add(point);
288 valid.set(point);
289 }
290 }
291 }else{len=0;}
292 }
293
294 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
295 if(!valid.get(i)){
296 list.add(i);
297 }
298 }
299 }
300
301 /*--------------------------------------------------------------*/
302 /*---------------- Processing GffLines ----------------*/
303 /*--------------------------------------------------------------*/
304
305 private static void processGene(GffLine gline, ScafData sd){
306 if(gline.length()<2){return;}
307 final int strand=gline.strand;
308 assert(strand==sd.strand());
309 final byte[] frames=sd.frames;
310 int start=gline.start-1, stop=gline.stop-1;
311 if(start<0 || stop>=sd.length()){return;}
312 // assert(start<stop) : gline; //Not always true for euks...
313 if(strand==Shared.MINUS){
314 int x=sd.length()-start-1;
315 int y=sd.length()-stop-1;
316 start=y;
317 stop=x;
318
319 // String a=new String(sd.bases, start, 3);
320 // String b=new String(sd.bases, stop-2, 3);
321 //// assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3);
322 // outstream.println(a+", "+b+", "+start+", "+stop);
323 }
324 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
325 markFrames(start, stop, frames, kInnerCDS);
326 sd.starts.add(start);
327 sd.stops.add(stop);
328 // assert(gline.start!=337) : gline+"\n"+start+", "+stop;
329 }
330
331 private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){
332 final int strand=gline.strand;
333 assert(strand==sd.strand());
334 final byte[] frames=sd.frames;
335 int start=gline.start-1, stop=gline.stop-1;
336 if(start<0 || stop>=sd.length()){return false;}
337 assert(start<=stop);
338 if(strand==Shared.MINUS){
339 int x=sd.length()-start-1;
340 int y=sd.length()-stop-1;
341 start=y;
342 stop=x;
343 }
344
345 if(AnalyzeGenes.alignRibo){
346 // byte[] seq=sd.fetch(start, stop);
347 Read[] consensusReads=ProkObject.consensusReads(type);
348 byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null);
349 float minIdentity=ProkObject.minID(type);
350 if(universal!=null){
351 int[] coords=KillSwitch.allocInt1D(2);
352 final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50));
353 final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50));
354 float id1=align(universal, sd.bases, a, b, minIdentity, coords);
355 final int rstart=coords[0], rstop=coords[1];
356 // assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
357 if(id1<minIdentity){
358 // System.err.println("Low identity: "+String.format("%.2s", 100*id1));
359 return false;
360 }else{
361 // System.err.println("Good identity: "+String.format("%.2s", 100*id1));
362 }
363 if(AnalyzeGenes.adjustEndpoints){
364 int startSlop=startSlop(type);
365 int stopSlop=stopSlop(type);
366 if(Tools.absdif(start, rstart)>startSlop){
367 // System.err.println("rstart:\t"+start+" -> "+rstart);
368 start=rstart;
369 }
370 if(Tools.absdif(stop, rstop)>stopSlop){
371 // System.err.println("rstop: \t"+stop+" -> "+rstop);
372 stop=rstop;
373 }
374 }
375 }
376 }
377
378 StatsContainer sc=allContainers[type];
379 sc.start.processPoint(sd.bases, start, 1);
380 sc.stop.processPoint(sd.bases, stop, 1);
381 assert(sc!=statsCDS);
382
383 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
384 final byte flag=typeToFlag(type);
385 for(int i=start+kInnerRNA-1; i<=stop; i++){
386 frames[i]|=flag;
387 }
388 return true;
389 }
390
391 private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){
392 // final int a=0, b=ref.length-1;
393 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA();
394 final int minScore=ssa.minScoreByIdentity(query.length, minIdentity);
395 int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore);
396 if(max==null){return 0;}
397
398 final int rows=max[0];
399 final int maxCol=max[1];
400 final int maxState=max[2];
401 // final int maxScore=max[3];
402 // final int maxStart=max[4];
403
404 //returns {score, bestRefStart, bestRefStop}
405 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
406 final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState);
407 final int rstart=Tools.max(score[1], 0);
408 final int rstop=Tools.min(score[2], ref.length-1);
409 if(coords!=null){
410 coords[0]=rstart;
411 coords[1]=rstop;
412 }
413 final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null);
414 return id;
415 }
416
417 /**
418 * Each frame byte has a bit marked for valid coding frames.
419 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon.
420 * If frames[23]=0, then no coding kmer end at that location on this strand.
421 * @param start
422 * @param stop
423 * @param frames
424 * @param k
425 */
426 private static void markFrames(int start, int stop, byte[] frames, int k){
427 assert(start<=stop) : start+", "+stop;
428 for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){
429 frames[i]=(byte)(frames[i]|frameBit);
430 frameBit<<=1;
431 if(frameBit>4){frameBit=1;}
432 }
433 // assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct
434 }
435
436 /*--------------------------------------------------------------*/
437 /*---------------- Counting Kmers ----------------*/
438 /*--------------------------------------------------------------*/
439
440 private void processCDS(ScafData sd, int strand){
441 if(!callCDS){return;}
442 ArrayList<GffLine> glines=sd.cdsLines[strand];
443 for(GffLine gline : glines){
444 assert(gline.strand==strand);
445 processGene(gline, sd);
446 statsCDS.addLength(gline.length());
447 }
448
449 statsCDS.inner.processCDSFrames(sd.bases, sd.frames);
450 BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1);
451 BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1);
452 // outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops.");
453 sd.clear();
454 findStartCodons(sd.bases, sd.starts, startSet);
455 findStopCodons(sd.bases, sd.stops, stopSet);
456 // outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops.");
457 processEnds(sd.bases, statsCDS.start, sd.starts, 0);
458 processEnds(sd.bases, statsCDS.stop, sd.stops, 0);
459 }
460
461 private static int getGlineType(GffLine gline, ScafData sd){
462 if(!gline.inbounds(sd.length()) || gline.partial()){return -1;}
463
464 final int length=gline.length();
465 final int type=gline.prokType();
466 if(type<0){
467 return type;
468 }else if(type==CDS){
469 return type;
470 }else if(type==tRNA && length>=40 && length<=120){
471 return type;
472 }else if(type==r16S && length>=1440 && length<=1620){
473 return type;
474 }else if(type==r23S && length>=2720 && length<=3170){
475 return type;
476 }else if(type==r5S && length>=90 && length<=150){
477 return type;
478 }else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range
479 return type;
480 }
481 return -1;
482 }
483
484 private void processRNA(ScafData sd, int strand){
485 sd.clear();
486 ArrayList<GffLine> lines=sd.rnaLines[strand];
487 for(GffLine gline : lines){
488 assert(gline.strand==strand);
489 final int type=getGlineType(gline, sd);
490 if(type>0){
491 StatsContainer sc=allContainers[type];
492 sc.addLength(gline.length());
493 processRnaLine(gline, sd, type);
494 }
495 }
496 processRnaInner(sd);
497 processRnaEnds(sd);
498 }
499
500 void processRnaInner(ScafData sd){
501 byte[] bases=sd.bases;
502 byte[] frames=sd.frames;
503 final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs.
504 final int mask=~((-1)<<(2*k));
505 int kmer=0;
506 int len=0;
507
508 for(int i=0; i<bases.length; i++){
509 byte b=bases[i];
510 int x=AminoAcid.baseToNumber[b];
511 kmer=((kmer<<2)|x)&mask;
512 if(x>=0){
513 len++;
514 if(len>=k){
515 int vf=frames[i];
516 for(int type=0; type<5; type++){
517 int valid=vf&1;
518 rnaContainers[type].inner.add(kmer, 0, valid);
519 vf=(vf>>1);
520 }
521 }
522 }else{len=0;}
523 }
524 }
525
526 void processRnaEnds(ScafData sd){
527 byte[] bases=sd.bases;
528
529 final int k=stats16S.start.k;
530 final int kMax=stats16S.start.kMax;
531 final int mask=stats16S.start.mask;
532 final long[] counts=new long[kMax];//TODO: Slow
533
534 int kmer=0;
535 int len=0;
536
537 for(int i=0; i<bases.length; i++){
538 byte b=bases[i];
539 int x=AminoAcid.baseToNumber[b];
540 kmer=((kmer<<2)|x)&mask;
541
542 if(x>=0){
543 len++;
544 if(len>=k){
545 counts[kmer]++;
546 }
547 }else{len=0;}
548 }
549 for(StatsContainer sc : rnaContainers){
550 FrameStats fs=sc.start;
551 for(long[] array : fs.countsFalse){
552 Tools.add(array, counts);
553 }
554 fs=sc.stop;
555 for(long[] array : fs.countsFalse){
556 Tools.add(array, counts);
557 }
558 }
559 }
560
561 private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){
562 BitSet points=new BitSet(bases.length);
563 for(int i=0; i<list.size; i++){
564 int point=list.get(i);
565 stats.processPoint(bases, list.get(i), valid);
566 points.set(point);
567 }
568 return points;
569 }
570
571 /*--------------------------------------------------------------*/
572 /*---------------- Scoring ----------------*/
573 /*--------------------------------------------------------------*/
574
575 // //Assumes bases are in the correct strand
576 // public float calcStartScore(int start, int stop, byte[] bases){
577 // float f=scorePoint(start, bases, startStats);
578 //// float ss=scoreStart2(start, bases, stop, innerKmerStats);
579 //// if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study.
580 // return f;
581 // }
582 //
583 // //Assumes bases are in the correct strand
584 // public float calcStopScore(int stop, byte[] bases){
585 // float f=scorePoint(stop, bases, stopStats);
586 // return f;
587 // }
588 //
589 // //Assumes bases are in the correct strand
590 // public float calcRnaStartScore(int start, int stop, byte[] bases){
591 // float f=scorePoint(start, bases, rrnaStartStats);
592 // return f;
593 // }
594 //
595 // //Assumes bases are in the correct strand
596 // public float calcRnaStopScore(int stop, byte[] bases){
597 // float f=scorePoint(stop, bases, rrnaStopStats);
598 // return f;
599 // }
600
601 // public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){
602 //
603 // assert(stats.frames==3);
604 // final int k=stats.k;
605 // final int mask=~((-1)<<(2*k));
606 //
607 // int kmer=0;
608 // int len=0;
609 // float score=0;
610 // int numKmers=0;
611 //
612 // for(int pos=start, currentFrame=startFrame; pos<stop; pos++){
613 // final byte b=bases[pos];
614 // final int x=AminoAcid.baseToNumber[b];
615 //
616 // if(x>=0){
617 // kmer=((kmer<<2)|x)&mask;
618 // len++;
619 // if(len>=k){
620 // float prob=stats.probs[currentFrame][kmer];
621 // float dif=prob-0.99f;//Prob above 1 is more likely than average
622 // score+=dif;
623 // numKmers++;
624 // }
625 // }else{
626 // len=0;
627 // kmer=0;
628 // }
629 //
630 // currentFrame++;
631 // if(currentFrame>2){currentFrame=0;}
632 // }
633 // return score/Tools.max(1f, numKmers);
634 // }
635 //
636 // /**
637 // * TODO
638 // * Evaluate the relative difference between left and right frequencies.
639 // * The purpose is to find locations where the left side looks noncoding and the right side looks coding.
640 // * Does not currently yield useful results.
641 // */
642 // public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){
643 // final int k=stats.k;
644 //
645 // int start=point-45;
646 // if(start<0 || stop>bases.length){return 0.5f;}
647 //
648 // float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats);
649 // float right=calcKmerScore(point, stop-3, 0, bases, stats);
650 // return right-left; //High numbers are likely to be starts; non-starts should be near 0.
651 // }
652
653 /*--------------------------------------------------------------*/
654 /*---------------- toString ----------------*/
655 /*--------------------------------------------------------------*/
656
657 @Override
658 public String toString(){
659 return appendTo(new ByteBuilder()).toString();
660 }
661
662 public ByteBuilder appendTo(ByteBuilder bb){
663
664 // Collections.sort(fnames);
665 taxIds.sort();
666
667 bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n");
668 bb.append("#files");
669 bb.tab().append(numFiles);
670 // if(fnames.size()>5){
671 // bb.tab().append(fnames.size());
672 // }else{
673 // for(String fname : fnames){
674 // bb.tab().append(fname);
675 // }
676 // }
677 bb.nl();
678 bb.append("#taxIDs");
679 for(int i=0; i<taxIds.size; i++){
680 bb.tab().append(taxIds.get(i));
681 }
682 bb.nl();
683 // bb.append("#k_inner\t").append(innerKmerLength).nl();
684 // bb.append("#k_end\t").append(endKmerLength).nl();
685 // bb.append("#start_left_offset\t").append(startLeftOffset).nl();
686 // bb.append("#start_right_offset\t").append(startRightOffset).nl();
687 // bb.append("#stop_left_offset\t").append(stopLeftOffset).nl();
688 // bb.append("#stop_right_offset\t").append(stopRightOffset).nl();
689 bb.append("#scaffolds\t").append(readsProcessed).nl();
690 bb.append("#bases\t").append(basesProcessed).nl();
691 bb.append("#genes\t").append(genesProcessed).nl();
692 bb.append("#GC\t").append(gc(),2).nl();
693 bb.append("#ACGTN");
694 for(long x : baseCounts){
695 bb.tab().append(x);
696 }
697 bb.nl();
698
699 for(StatsContainer sc : allContainers){
700 sc.appendTo(bb);
701 }
702 assert(allContainers.length>5) : allContainers.length;
703
704 return bb;
705 }
706
707 /*--------------------------------------------------------------*/
708 /*---------------- Stats ----------------*/
709 /*--------------------------------------------------------------*/
710
711 public final StatsContainer statsCDS=new StatsContainer(CDS);
712 public final StatsContainer statstRNA=new StatsContainer(tRNA);
713 public final StatsContainer stats16S=new StatsContainer(r16S);
714 public final StatsContainer stats23S=new StatsContainer(r23S);
715 public final StatsContainer stats5S=new StatsContainer(r5S);
716 public final StatsContainer stats18S=new StatsContainer(r18S);
717
718 final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S};
719 final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S};
720 //public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
721
722 // public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0);
723 // public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset);
724 // public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset);
725 //
726 // public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8);
727 // public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8);
728 //
729 // public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0);
730 // public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0);
731 // public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0);
732 // public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0);
733 // public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats};
734
735 /*--------------------------------------------------------------*/
736
737 // public ArrayList<String> fnames=new ArrayList<String>();
738 public int numFiles=0;
739 public IntList taxIds=new IntList();
740
741
742 /*--------------------------------------------------------------*/
743 /*---------------- Fields ----------------*/
744 /*--------------------------------------------------------------*/
745
746 private long maxReads=-1;
747
748 long readsProcessed=0;
749 long basesProcessed=0;
750 long genesProcessed=0;
751 long filesProcessed=0;
752 long[] baseCounts=new long[5];
753
754 /*--------------------------------------------------------------*/
755 /*---------------- Static Setters ----------------*/
756 /*--------------------------------------------------------------*/
757
758 public synchronized void setStatics(){
759 // assert(!setStatics);
760 kInnerCDS=statsCDS.inner.k;
761 kStartCDS=statsCDS.start.k;
762 kStopCDS=statsCDS.stop.k;
763
764 setStartLeftOffset(statsCDS.start.leftOffset);
765 setStartRightOffset(statsCDS.start.rightOffset());
766
767 setStopLeftOffset(statsCDS.stop.leftOffset);
768 setStopRightOffset(statsCDS.stop.rightOffset());
769
770 kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here?
771 kStartRNA=stats16S.start.k;
772 kStopRNA=stats16S.stop.k;
773 setStatics=true;
774 }
775
776 public static void setInnerK(int k){
777 kInnerCDS=k;
778 }
779
780 public static void setStartK(int k){
781 kStartCDS=k;
782 }
783
784 public static void setStopK(int k){
785 kStopCDS=k;
786 }
787
788 public static void setStartLeftOffset(int x){
789 startLeftOffset=x;
790 startFrames=startLeftOffset+startRightOffset+1;
791 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
792 }
793
794 public static void setStartRightOffset(int x){
795 startRightOffset=x;
796 startFrames=startLeftOffset+startRightOffset+1;
797 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
798 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
799 }
800
801 public static void setStopLeftOffset(int x){
802 stopLeftOffset=x;
803 stopFrames=stopLeftOffset+stopRightOffset+1;
804 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
805 }
806
807 public static void setStopRightOffset(int x){
808 stopRightOffset=x;
809 stopFrames=stopLeftOffset+stopRightOffset+1;
810 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
811 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
812 }
813
814 public static final boolean isStartCodon(int code){
815 return code>=0 && code<=63 && isStartCodon[code];
816 }
817 public static final boolean isStopCodon(int code){
818 return code>=0 && code<=63 && isStopCodon[code];
819 }
820
821 /*--------------------------------------------------------------*/
822 /*---------------- Class Init ----------------*/
823 /*--------------------------------------------------------------*/
824
825 private static boolean[] makeIsCodon(String[] codons){
826 boolean[] array=new boolean[64];
827 for(String s : codons){
828 int x=AminoAcid.toNumber(s);
829 array[x]=true;
830 }
831 return array;
832 }
833
834 public static int kInnerCDS=6;
835 public static int kStartCDS=3;
836 public static int kStopCDS=3;
837
838 static int startLeftOffset(){return startLeftOffset;}
839 static int startRightOffset(){return startRightOffset;}
840 static int startFrames(){return startFrames;}
841
842 private static int startLeftOffset=21; //21 works well for k=4
843 private static int startRightOffset=8; //10 works well for k=4
844 private static int startFrames=startLeftOffset+startRightOffset+1;
845
846 private static int stopLeftOffset=9;
847 private static int stopRightOffset=12;
848 private static int stopFrames=stopLeftOffset+stopRightOffset+1;
849
850 private static boolean setStatics=false;
851
852 /*--------------------------------------------------------------*/
853 /*---------------- More Statics ----------------*/
854 /*--------------------------------------------------------------*/
855
856 //E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9]
857 public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"};
858 public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"};
859 public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"};
860 public static boolean[] isStartCodon=makeIsCodon(startCodons);
861 public static boolean[] isStopCodon=makeIsCodon(stopCodons);
862
863 /*--------------------------------------------------------------*/
864
865 private static PrintStream outstream=System.err;
866 public static boolean verbose=false;
867 public static boolean errorState=false;
868
869 }