view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line source
package prok;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;

import aligner.SingleStateAlignerFlat2;
import dna.AminoAcid;
import fileIO.FileFormat;
import gff.GffLine;
import shared.KillSwitch;
import shared.Shared;
import shared.Tools;
import stream.Read;
import stream.ReadInputStream;
import structures.ByteBuilder;
import structures.IntList;

/**
 * This class is designed to store kmer frequencies related to gene
 * starts, stops, and interiors.  It can be loaded from a pgm file.
 * 
 * It's possible to use multiple GeneModels; for example, one for
 * each of several GC ranges or clades.
 * @author Brian Bushnell
 * @date Sep 24, 2018
 *
 */
public class GeneModel extends ProkObject {
	
	/*--------------------------------------------------------------*/
	/*----------------        Initialization        ----------------*/
	/*--------------------------------------------------------------*/

	public GeneModel(boolean fill){
		if(fill){
			fillContainers();
		}
	}
	
	void fillContainers(){
		statsCDS.setInner(kInnerCDS, 3);
		statsCDS.setStart(kStartCDS, startFrames, startLeftOffset);
		statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset);

		for(int i=0; i<rnaContainers.length; i++){
			StatsContainer sc=rnaContainers[i];
			sc.setInner(kInnerRNA, 1);
		}

		statstRNA.setStart(kStartRNA, 14, 4);
		statstRNA.setStop(kStopRNA, 14, 6);

		stats16S.setStart(kStartRNA, 20, 7);
		stats16S.setStop(kStopRNA, 12, 16);

		stats23S.setStart(kStartRNA, 17, 3);
		stats23S.setStop(kStopRNA, 15, 12);

		stats5S.setStart(kStartRNA, 20, 5);
		stats5S.setStop(kStopRNA, 15, 5);

		stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined
		stats18S.setStop(kStopRNA, 12, 16);
	}
	
	/*--------------------------------------------------------------*/
	/*----------------        Public Methods        ----------------*/
	/*--------------------------------------------------------------*/
	
	public boolean process(String genomeFname, String gffFname){
//		fnames.add(ReadWrite.stripPath(genomeFname));
		numFiles++;
		FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true);
		FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true);
		
		if(fnaFile==null || gffFile==null){
			errorState=true;
			return true;
		}
		filesProcessed++;
		
		ArrayList<ScafData> scafList;
		{//Scoped to save memory
			ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads);
			readsProcessed+=reads.size();
			scafList=new ArrayList<ScafData>(reads.size());
			for(Read r : reads){
				basesProcessed+=r.length();
				scafList.add(new ScafData(r));
			}
		}
		{//Scoped to save memory
			ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true);
			ArrayList<GffLine> cds=allGffLines[0];
			ArrayList<GffLine> rrna=allGffLines[1];
			ArrayList<GffLine> trna=allGffLines[2];
			genesProcessed+=cds.size();
			genesProcessed+=(rrna==null ? 0 : rrna.size());
			genesProcessed+=(trna==null ? 0 : trna.size());
			
			HashMap<String, ScafData> scafMap=makeScafMap(scafList);
			fillScafDataCDS(cds, scafMap);
			fillScafDataRNA(rrna, scafMap);
			fillScafDataRNA(trna, scafMap);
		}
		
		countBases(scafList);
		if(PROCESS_PLUS_STRAND){
			processStrand(scafList, Shared.PLUS);
		}
		if(PROCESS_MINUS_STRAND){
			for(ScafData sd : scafList){
				sd.clear();
				sd.reverseComplement();
			}
			processStrand(scafList, Shared.MINUS);
			for(ScafData sd : scafList){
				sd.clear();
				sd.reverseComplement();
			}
		}
		return false;
	}
	
	public void add(GeneModel pgm){
		for(int i=0; i<allContainers.length; i++){
//			System.err.println("merging "+allContainers[i].name);
			allContainers[i].add(pgm.allContainers[i]);
		}
		
		readsProcessed+=pgm.readsProcessed;
		basesProcessed+=pgm.basesProcessed;
		genesProcessed+=pgm.genesProcessed;
		filesProcessed+=pgm.filesProcessed;

//		geneStartsProcessed+=pgm.geneStartsProcessed;
//		tRNAProcessed+=pgm.tRNAProcessed;
//		r16SProcessed+=pgm.r16SProcessed;
//		r23SProcessed+=pgm.r23SProcessed;
//		r5SProcessed+=pgm.r5SProcessed;
//		r18SProcessed+=pgm.r18SProcessed;
		
//		fnames.addAll(pgm.fnames);
		numFiles+=pgm.numFiles;
		taxIds.addAll(pgm.taxIds);
		Tools.add(baseCounts, pgm.baseCounts);
	}
	
	public void multiplyBy(double mult) {
		for(int i=0; i<allContainers.length; i++){
			allContainers[i].multiplyBy(mult);
		}
		
		readsProcessed=Math.round(readsProcessed*mult);
		basesProcessed=Math.round(basesProcessed*mult);
		genesProcessed=Math.round(genesProcessed*mult);
		filesProcessed=Math.round(filesProcessed*mult);
		
		for(int i=0; i<baseCounts.length; i++){
			baseCounts[i]=Math.round(baseCounts[i]*mult);
		}
	}
	
	/*--------------------------------------------------------------*/
	/*----------------        Outer Methods         ----------------*/
	/*--------------------------------------------------------------*/
	
	public float gc(){
		long a=baseCounts[0];
		long c=baseCounts[1];
		long g=baseCounts[2];
		long t=baseCounts[3];
		return (float)((g+c)/Tools.max(1.0, a+t+g+c));
	}
	
	HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){
		HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3);
		for(ScafData sd : scafList){scafMap.put(sd.name, sd);}
		for(ScafData sd : scafList){
			String name=sd.name;
			int idx=name.indexOf(' ');
			if(idx>=0){
				String prefix=name.substring(0, idx);
				if(scafMap.containsKey(prefix)){
					assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'";
				}else{
					scafMap.put(prefix, sd);
				}
			}
		}
		return scafMap;
	}
	
	public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){
		if(!callCDS){return;}
		for(GffLine gline : cdsLines){
			ScafData sd=scafMap.get(gline.seqid);
			assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
			sd.addCDS(gline);
		}
	}
	
	public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){
		for(GffLine gline : rnaLines){
			ScafData sd=scafMap.get(gline.seqid);
			assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
			if(processType(gline.prokType())){
				sd.addRNA(gline);
			}
		}
	}
	
	public void processStrand(ArrayList<ScafData> scafList, int strand){
		for(ScafData sd : scafList){
			processCDS(sd, strand);
			processRNA(sd, strand);
		}
	}
	
	/*--------------------------------------------------------------*/
	/*----------------        Inner Methods         ----------------*/
	/*--------------------------------------------------------------*/

	private void countBases(ArrayList<ScafData> scafList){
		for(ScafData sd : scafList){
			countBases(sd.bases);
		}
	}
	
	private void countBases(byte[] bases){
		for(byte b : bases){
			int x=AminoAcid.baseToNumberACGTother[b];
			baseCounts[x]++;
		}
	}
	
	/*--------------------------------------------------------------*/
	/*----------------        Finding Codons        ----------------*/
	/*--------------------------------------------------------------*/
	
	private static void findStopCodons(byte[] bases, IntList list, BitSet valid){
		final int k=3;
		final int mask=~((-1)<<(2*k));
		int kmer=0;
		int len=0;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			if(x>=0){
				len++;
				if(len>=k){
					int point=i;//End of the stop codon
					if(isStopCodon(kmer) && !valid.get(point)){
						list.add(point);
						valid.set(point);
					}
				}
			}else{len=0;}
		}
		
		for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
			if(!valid.get(i)){
				list.add(i);
			}
		}
	}
	
	private static void findStartCodons(byte[] bases, IntList list, BitSet valid){
		final int k=3;
		final int mask=~((-1)<<(2*k));
		int kmer=0;
		int len=0;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			if(x>=0){
				len++;
				if(len>=k){
					int point=i-k+1;//Start of the start codon
					if(isStartCodon(kmer) && !valid.get(point)){
						list.add(point);
						valid.set(point);
					}
				}
			}else{len=0;}
		}
		
		for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
			if(!valid.get(i)){
				list.add(i);
			}
		}
	}
	
	/*--------------------------------------------------------------*/
	/*----------------     Processing GffLines      ----------------*/
	/*--------------------------------------------------------------*/
	
	private static void processGene(GffLine gline, ScafData sd){
		if(gline.length()<2){return;}
		final int strand=gline.strand;
		assert(strand==sd.strand());
		final byte[] frames=sd.frames;
		int start=gline.start-1, stop=gline.stop-1;
		if(start<0 || stop>=sd.length()){return;}
//		assert(start<stop) : gline; //Not always true for euks...
		if(strand==Shared.MINUS){
			int x=sd.length()-start-1;
			int y=sd.length()-stop-1;
			start=y;
			stop=x;

//			String a=new String(sd.bases, start, 3);
//			String b=new String(sd.bases, stop-2, 3);
////			assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3);
//			outstream.println(a+", "+b+", "+start+", "+stop);
		}
		assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
		markFrames(start, stop, frames, kInnerCDS);
		sd.starts.add(start);
		sd.stops.add(stop);
//		assert(gline.start!=337) : gline+"\n"+start+", "+stop;
	}
	
	private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){
		final int strand=gline.strand;
		assert(strand==sd.strand());
		final byte[] frames=sd.frames;
		int start=gline.start-1, stop=gline.stop-1;
		if(start<0 || stop>=sd.length()){return false;}
		assert(start<=stop);
		if(strand==Shared.MINUS){
			int x=sd.length()-start-1;
			int y=sd.length()-stop-1;
			start=y;
			stop=x;
		}
		
		if(AnalyzeGenes.alignRibo){
//			byte[] seq=sd.fetch(start, stop);
			Read[] consensusReads=ProkObject.consensusReads(type);
			byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null);
			float minIdentity=ProkObject.minID(type);
			if(universal!=null){
				int[] coords=KillSwitch.allocInt1D(2);
				final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50));
				final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50));
				float id1=align(universal, sd.bases, a, b, minIdentity, coords);
				final int rstart=coords[0], rstop=coords[1];
//				assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
				if(id1<minIdentity){
//					System.err.println("Low identity: "+String.format("%.2s", 100*id1));
					return false;
				}else{
//					System.err.println("Good identity: "+String.format("%.2s", 100*id1));
				}
				if(AnalyzeGenes.adjustEndpoints){
					int startSlop=startSlop(type);
					int stopSlop=stopSlop(type);
					if(Tools.absdif(start, rstart)>startSlop){
//						System.err.println("rstart:\t"+start+" -> "+rstart);
						start=rstart;
					}
					if(Tools.absdif(stop, rstop)>stopSlop){
//						System.err.println("rstop: \t"+stop+" -> "+rstop);
						stop=rstop;
					}
				}
			}
		}
		
		StatsContainer sc=allContainers[type];
		sc.start.processPoint(sd.bases, start, 1);
		sc.stop.processPoint(sd.bases, stop, 1);
		assert(sc!=statsCDS);
		
		assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
		final byte flag=typeToFlag(type);
		for(int i=start+kInnerRNA-1; i<=stop; i++){
			frames[i]|=flag;
		}
		return true;
	}
	
	private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){
//		final int a=0, b=ref.length-1;
		SingleStateAlignerFlat2 ssa=GeneCaller.getSSA();
		final int minScore=ssa.minScoreByIdentity(query.length, minIdentity);
		int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore);
		if(max==null){return 0;}
		
		final int rows=max[0];
		final int maxCol=max[1];
		final int maxState=max[2];
//		final int maxScore=max[3];
//		final int maxStart=max[4];
		
		//returns {score, bestRefStart, bestRefStop} 
		//padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
		final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState);
		final int rstart=Tools.max(score[1], 0);
		final int rstop=Tools.min(score[2], ref.length-1);
		if(coords!=null){
			coords[0]=rstart;
			coords[1]=rstop;
		}
		final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null);
		return id;
	}
	
	/** 
	 * Each frame byte has a bit marked for valid coding frames.
	 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon.
	 * If frames[23]=0, then no coding kmer end at that location on this strand.
	 * @param start
	 * @param stop
	 * @param frames
	 * @param k
	 */
	private static void markFrames(int start, int stop, byte[] frames, int k){
		assert(start<=stop) : start+", "+stop;
		for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){
			frames[i]=(byte)(frames[i]|frameBit);
			frameBit<<=1;
			if(frameBit>4){frameBit=1;}
		}
//		assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct
	}
	
	/*--------------------------------------------------------------*/
	/*----------------        Counting Kmers        ----------------*/
	/*--------------------------------------------------------------*/
	
	private void processCDS(ScafData sd, int strand){
		if(!callCDS){return;}
		ArrayList<GffLine> glines=sd.cdsLines[strand];
		for(GffLine gline : glines){
			assert(gline.strand==strand);
			processGene(gline, sd);
			statsCDS.addLength(gline.length());
		}
		
		statsCDS.inner.processCDSFrames(sd.bases, sd.frames);
		BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1);
		BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1);
//		outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops.");
		sd.clear();
		findStartCodons(sd.bases, sd.starts, startSet);
		findStopCodons(sd.bases, sd.stops, stopSet);
//		outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops.");
		processEnds(sd.bases, statsCDS.start, sd.starts, 0);
		processEnds(sd.bases, statsCDS.stop, sd.stops, 0);
	}
	
	private static int getGlineType(GffLine gline, ScafData sd){
		if(!gline.inbounds(sd.length()) || gline.partial()){return -1;}

		final int length=gline.length();
		final int type=gline.prokType();
		if(type<0){
			return type;
		}else if(type==CDS){
			return type;
		}else if(type==tRNA && length>=40 && length<=120){
			return type;
		}else if(type==r16S && length>=1440 && length<=1620){
			return type;
		}else if(type==r23S && length>=2720 && length<=3170){
			return type;
		}else if(type==r5S && length>=90 && length<=150){
			return type;
		}else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range
			return type;
		}
		return -1;
	}
	
	private void processRNA(ScafData sd, int strand){
		sd.clear();
		ArrayList<GffLine> lines=sd.rnaLines[strand];
		for(GffLine gline : lines){
			assert(gline.strand==strand);
			final int type=getGlineType(gline, sd);
			if(type>0){
				StatsContainer sc=allContainers[type];
				sc.addLength(gline.length());
				processRnaLine(gline, sd, type);
			}
		}
		processRnaInner(sd);
		processRnaEnds(sd);
	}
	
	void processRnaInner(ScafData sd){
		byte[] bases=sd.bases;
		byte[] frames=sd.frames;
		final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs.
		final int mask=~((-1)<<(2*k));
		int kmer=0;
		int len=0;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			if(x>=0){
				len++;
				if(len>=k){
					int vf=frames[i];
					for(int type=0; type<5; type++){
						int valid=vf&1;
						rnaContainers[type].inner.add(kmer, 0, valid);
						vf=(vf>>1);
					}
				}
			}else{len=0;}
		}
	}
	
	void processRnaEnds(ScafData sd){
		byte[] bases=sd.bases;

		final int k=stats16S.start.k;
		final int kMax=stats16S.start.kMax;
		final int mask=stats16S.start.mask;
		final long[] counts=new long[kMax];//TODO: Slow
		
		int kmer=0;
		int len=0;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			
			if(x>=0){
				len++;
				if(len>=k){
					counts[kmer]++;
				}
			}else{len=0;}
		}
		for(StatsContainer sc : rnaContainers){
			FrameStats fs=sc.start;
			for(long[] array : fs.countsFalse){
				Tools.add(array, counts);
			}
			fs=sc.stop;
			for(long[] array : fs.countsFalse){
				Tools.add(array, counts);
			}
		}
	}
	
	private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){
		BitSet points=new BitSet(bases.length);
		for(int i=0; i<list.size; i++){
			int point=list.get(i);
			stats.processPoint(bases, list.get(i), valid);
			points.set(point);
		}
		return points;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------           Scoring            ----------------*/
	/*--------------------------------------------------------------*/
	
//	//Assumes bases are in the correct strand
//	public float calcStartScore(int start, int stop, byte[] bases){
//		float f=scorePoint(start, bases, startStats);
////		float ss=scoreStart2(start, bases, stop, innerKmerStats);
////		if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study.
//		return f;
//	}
//	
//	//Assumes bases are in the correct strand
//	public float calcStopScore(int stop, byte[] bases){
//		float f=scorePoint(stop, bases, stopStats);
//		return f;
//	}
//	
//	//Assumes bases are in the correct strand
//	public float calcRnaStartScore(int start, int stop, byte[] bases){
//		float f=scorePoint(start, bases, rrnaStartStats);
//		return f;
//	}
//	
//	//Assumes bases are in the correct strand
//	public float calcRnaStopScore(int stop, byte[] bases){
//		float f=scorePoint(stop, bases, rrnaStopStats);
//		return f;
//	}
	
//	public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){
//
//		assert(stats.frames==3);
//		final int k=stats.k;
//		final int mask=~((-1)<<(2*k));
//		
//		int kmer=0;
//		int len=0;
//		float score=0;
//		int numKmers=0;
//		
//		for(int pos=start, currentFrame=startFrame; pos<stop; pos++){
//			final byte b=bases[pos];
//			final int x=AminoAcid.baseToNumber[b];
//
//			if(x>=0){
//				kmer=((kmer<<2)|x)&mask;
//				len++;
//				if(len>=k){
//					float prob=stats.probs[currentFrame][kmer];
//					float dif=prob-0.99f;//Prob above 1 is more likely than average
//					score+=dif;
//					numKmers++;
//				}
//			}else{
//				len=0;
//				kmer=0;
//			}
//
//			currentFrame++;
//			if(currentFrame>2){currentFrame=0;}
//		}
//		return score/Tools.max(1f, numKmers);
//	}
//	
//	/**
//	 * TODO
//	 * Evaluate the relative difference between left and right frequencies.
//	 * The purpose is to find locations where the left side looks noncoding and the right side looks coding.
//	 * Does not currently yield useful results.
//	 */
//	public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){
//		final int k=stats.k;
//		
//		int start=point-45;
//		if(start<0 || stop>bases.length){return 0.5f;}
//
//		float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats);
//		float right=calcKmerScore(point, stop-3, 0, bases, stats);
//		return right-left; //High numbers are likely to be starts; non-starts should be near 0.
//	}
	
	/*--------------------------------------------------------------*/
	/*----------------           toString           ----------------*/
	/*--------------------------------------------------------------*/
	
	@Override
	public String toString(){
		return appendTo(new ByteBuilder()).toString();
	}
	
	public ByteBuilder appendTo(ByteBuilder bb){

//		Collections.sort(fnames);
		taxIds.sort();
		
		bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n");
		bb.append("#files");
		bb.tab().append(numFiles);
//		if(fnames.size()>5){
//			bb.tab().append(fnames.size());
//		}else{
//			for(String fname : fnames){
//				bb.tab().append(fname);
//			}
//		}
		bb.nl();
		bb.append("#taxIDs");
		for(int i=0; i<taxIds.size; i++){
			bb.tab().append(taxIds.get(i));
		}
		bb.nl();
//		bb.append("#k_inner\t").append(innerKmerLength).nl();
//		bb.append("#k_end\t").append(endKmerLength).nl();
//		bb.append("#start_left_offset\t").append(startLeftOffset).nl();
//		bb.append("#start_right_offset\t").append(startRightOffset).nl();
//		bb.append("#stop_left_offset\t").append(stopLeftOffset).nl();
//		bb.append("#stop_right_offset\t").append(stopRightOffset).nl();
		bb.append("#scaffolds\t").append(readsProcessed).nl();
		bb.append("#bases\t").append(basesProcessed).nl();
		bb.append("#genes\t").append(genesProcessed).nl();
		bb.append("#GC\t").append(gc(),2).nl();
		bb.append("#ACGTN");
		for(long x : baseCounts){
			bb.tab().append(x);
		}
		bb.nl();

		for(StatsContainer sc : allContainers){
			sc.appendTo(bb);
		}
		assert(allContainers.length>5) : allContainers.length;
		
		return bb;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------            Stats             ----------------*/
	/*--------------------------------------------------------------*/

	public final StatsContainer statsCDS=new StatsContainer(CDS);
	public final StatsContainer statstRNA=new StatsContainer(tRNA);
	public final StatsContainer stats16S=new StatsContainer(r16S);
	public final StatsContainer stats23S=new StatsContainer(r23S);
	public final StatsContainer stats5S=new StatsContainer(r5S);
	public final StatsContainer stats18S=new StatsContainer(r18S);
	
	final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S};
	final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S};
	//public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
	
//	public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0);
//	public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset);
//	public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset);
//
//	public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8);
//	public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8);
//	
//	public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0);
//	public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0);
//	public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0);
//	public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0);
//	public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats};
	
	/*--------------------------------------------------------------*/
	
//	public ArrayList<String> fnames=new ArrayList<String>();
	public int numFiles=0;
	public IntList taxIds=new IntList();

	
	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/

	private long maxReads=-1;
	
	long readsProcessed=0;
	long basesProcessed=0;
	long genesProcessed=0;
	long filesProcessed=0;
	long[] baseCounts=new long[5];
	
	/*--------------------------------------------------------------*/
	/*----------------        Static Setters        ----------------*/
	/*--------------------------------------------------------------*/

	public synchronized void setStatics(){
//		assert(!setStatics);
		kInnerCDS=statsCDS.inner.k;
		kStartCDS=statsCDS.start.k;
		kStopCDS=statsCDS.stop.k;
		
		setStartLeftOffset(statsCDS.start.leftOffset);
		setStartRightOffset(statsCDS.start.rightOffset());
		
		setStopLeftOffset(statsCDS.stop.leftOffset);
		setStopRightOffset(statsCDS.stop.rightOffset());
		
		kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here?
		kStartRNA=stats16S.start.k;
		kStopRNA=stats16S.stop.k;
		setStatics=true;
	}
	
	public static void setInnerK(int k){
		kInnerCDS=k;
	}
	
	public static void setStartK(int k){
		kStartCDS=k;
	}
	
	public static void setStopK(int k){
		kStopCDS=k;
	}
	
	public static void setStartLeftOffset(int x){
		startLeftOffset=x;
		startFrames=startLeftOffset+startRightOffset+1;
//		System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
	}
	
	public static void setStartRightOffset(int x){
		startRightOffset=x;
		startFrames=startLeftOffset+startRightOffset+1;
//		System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
//		assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
	}
	
	public static void setStopLeftOffset(int x){
		stopLeftOffset=x;
		stopFrames=stopLeftOffset+stopRightOffset+1;
//		System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
	}
	
	public static void setStopRightOffset(int x){
		stopRightOffset=x;
		stopFrames=stopLeftOffset+stopRightOffset+1;
//		System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
//		assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
	}
	
	public static final boolean isStartCodon(int code){
		return code>=0 && code<=63 && isStartCodon[code];
	}
	public static final boolean isStopCodon(int code){
		return code>=0 && code<=63 && isStopCodon[code];
	}
	
	/*--------------------------------------------------------------*/
	/*----------------          Class Init          ----------------*/
	/*--------------------------------------------------------------*/
	
	private static boolean[] makeIsCodon(String[] codons){
		boolean[] array=new boolean[64];
		for(String s : codons){
			int x=AminoAcid.toNumber(s);
			array[x]=true;
		}
		return array;
	}
	
	public static int kInnerCDS=6;
	public static int kStartCDS=3;
	public static int kStopCDS=3;
	
	static int startLeftOffset(){return startLeftOffset;}
	static int startRightOffset(){return startRightOffset;}
	static int startFrames(){return startFrames;}
	
	private static int startLeftOffset=21; //21 works well for k=4
	private static int startRightOffset=8; //10 works well for k=4
	private static int startFrames=startLeftOffset+startRightOffset+1;
	
	private static int stopLeftOffset=9;
	private static int stopRightOffset=12;
	private static int stopFrames=stopLeftOffset+stopRightOffset+1;
	
	private static boolean setStatics=false;
	
	/*--------------------------------------------------------------*/
	/*----------------         More Statics         ----------------*/
	/*--------------------------------------------------------------*/
	
	//E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9]
	public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"};
	public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"};
	public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"};
	public static boolean[] isStartCodon=makeIsCodon(startCodons);
	public static boolean[] isStopCodon=makeIsCodon(stopCodons);
	
	/*--------------------------------------------------------------*/
	
	private static PrintStream outstream=System.err;
	public static boolean verbose=false;
	public static boolean errorState=false;
	
}