diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,869 @@
+package prok;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+
+import aligner.SingleStateAlignerFlat2;
+import dna.AminoAcid;
+import fileIO.FileFormat;
+import gff.GffLine;
+import shared.KillSwitch;
+import shared.Shared;
+import shared.Tools;
+import stream.Read;
+import stream.ReadInputStream;
+import structures.ByteBuilder;
+import structures.IntList;
+
+/**
+ * This class is designed to store kmer frequencies related to gene
+ * starts, stops, and interiors.  It can be loaded from a pgm file.
+ * 
+ * It's possible to use multiple GeneModels; for example, one for
+ * each of several GC ranges or clades.
+ * @author Brian Bushnell
+ * @date Sep 24, 2018
+ *
+ */
+public class GeneModel extends ProkObject {
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+
+	public GeneModel(boolean fill){
+		if(fill){
+			fillContainers();
+		}
+	}
+	
+	void fillContainers(){
+		statsCDS.setInner(kInnerCDS, 3);
+		statsCDS.setStart(kStartCDS, startFrames, startLeftOffset);
+		statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset);
+
+		for(int i=0; i<rnaContainers.length; i++){
+			StatsContainer sc=rnaContainers[i];
+			sc.setInner(kInnerRNA, 1);
+		}
+
+		statstRNA.setStart(kStartRNA, 14, 4);
+		statstRNA.setStop(kStopRNA, 14, 6);
+
+		stats16S.setStart(kStartRNA, 20, 7);
+		stats16S.setStop(kStopRNA, 12, 16);
+
+		stats23S.setStart(kStartRNA, 17, 3);
+		stats23S.setStop(kStopRNA, 15, 12);
+
+		stats5S.setStart(kStartRNA, 20, 5);
+		stats5S.setStop(kStopRNA, 15, 5);
+
+		stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined
+		stats18S.setStop(kStopRNA, 12, 16);
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Public Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	public boolean process(String genomeFname, String gffFname){
+//		fnames.add(ReadWrite.stripPath(genomeFname));
+		numFiles++;
+		FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true);
+		FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true);
+		
+		if(fnaFile==null || gffFile==null){
+			errorState=true;
+			return true;
+		}
+		filesProcessed++;
+		
+		ArrayList<ScafData> scafList;
+		{//Scoped to save memory
+			ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads);
+			readsProcessed+=reads.size();
+			scafList=new ArrayList<ScafData>(reads.size());
+			for(Read r : reads){
+				basesProcessed+=r.length();
+				scafList.add(new ScafData(r));
+			}
+		}
+		{//Scoped to save memory
+			ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true);
+			ArrayList<GffLine> cds=allGffLines[0];
+			ArrayList<GffLine> rrna=allGffLines[1];
+			ArrayList<GffLine> trna=allGffLines[2];
+			genesProcessed+=cds.size();
+			genesProcessed+=(rrna==null ? 0 : rrna.size());
+			genesProcessed+=(trna==null ? 0 : trna.size());
+			
+			HashMap<String, ScafData> scafMap=makeScafMap(scafList);
+			fillScafDataCDS(cds, scafMap);
+			fillScafDataRNA(rrna, scafMap);
+			fillScafDataRNA(trna, scafMap);
+		}
+		
+		countBases(scafList);
+		if(PROCESS_PLUS_STRAND){
+			processStrand(scafList, Shared.PLUS);
+		}
+		if(PROCESS_MINUS_STRAND){
+			for(ScafData sd : scafList){
+				sd.clear();
+				sd.reverseComplement();
+			}
+			processStrand(scafList, Shared.MINUS);
+			for(ScafData sd : scafList){
+				sd.clear();
+				sd.reverseComplement();
+			}
+		}
+		return false;
+	}
+	
+	public void add(GeneModel pgm){
+		for(int i=0; i<allContainers.length; i++){
+//			System.err.println("merging "+allContainers[i].name);
+			allContainers[i].add(pgm.allContainers[i]);
+		}
+		
+		readsProcessed+=pgm.readsProcessed;
+		basesProcessed+=pgm.basesProcessed;
+		genesProcessed+=pgm.genesProcessed;
+		filesProcessed+=pgm.filesProcessed;
+
+//		geneStartsProcessed+=pgm.geneStartsProcessed;
+//		tRNAProcessed+=pgm.tRNAProcessed;
+//		r16SProcessed+=pgm.r16SProcessed;
+//		r23SProcessed+=pgm.r23SProcessed;
+//		r5SProcessed+=pgm.r5SProcessed;
+//		r18SProcessed+=pgm.r18SProcessed;
+		
+//		fnames.addAll(pgm.fnames);
+		numFiles+=pgm.numFiles;
+		taxIds.addAll(pgm.taxIds);
+		Tools.add(baseCounts, pgm.baseCounts);
+	}
+	
+	public void multiplyBy(double mult) {
+		for(int i=0; i<allContainers.length; i++){
+			allContainers[i].multiplyBy(mult);
+		}
+		
+		readsProcessed=Math.round(readsProcessed*mult);
+		basesProcessed=Math.round(basesProcessed*mult);
+		genesProcessed=Math.round(genesProcessed*mult);
+		filesProcessed=Math.round(filesProcessed*mult);
+		
+		for(int i=0; i<baseCounts.length; i++){
+			baseCounts[i]=Math.round(baseCounts[i]*mult);
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Outer Methods         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	public float gc(){
+		long a=baseCounts[0];
+		long c=baseCounts[1];
+		long g=baseCounts[2];
+		long t=baseCounts[3];
+		return (float)((g+c)/Tools.max(1.0, a+t+g+c));
+	}
+	
+	HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){
+		HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3);
+		for(ScafData sd : scafList){scafMap.put(sd.name, sd);}
+		for(ScafData sd : scafList){
+			String name=sd.name;
+			int idx=name.indexOf(' ');
+			if(idx>=0){
+				String prefix=name.substring(0, idx);
+				if(scafMap.containsKey(prefix)){
+					assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'";
+				}else{
+					scafMap.put(prefix, sd);
+				}
+			}
+		}
+		return scafMap;
+	}
+	
+	public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){
+		if(!callCDS){return;}
+		for(GffLine gline : cdsLines){
+			ScafData sd=scafMap.get(gline.seqid);
+			assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
+			sd.addCDS(gline);
+		}
+	}
+	
+	public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){
+		for(GffLine gline : rnaLines){
+			ScafData sd=scafMap.get(gline.seqid);
+			assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
+			if(processType(gline.prokType())){
+				sd.addRNA(gline);
+			}
+		}
+	}
+	
+	public void processStrand(ArrayList<ScafData> scafList, int strand){
+		for(ScafData sd : scafList){
+			processCDS(sd, strand);
+			processRNA(sd, strand);
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Inner Methods         ----------------*/
+	/*--------------------------------------------------------------*/
+
+	private void countBases(ArrayList<ScafData> scafList){
+		for(ScafData sd : scafList){
+			countBases(sd.bases);
+		}
+	}
+	
+	private void countBases(byte[] bases){
+		for(byte b : bases){
+			int x=AminoAcid.baseToNumberACGTother[b];
+			baseCounts[x]++;
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Finding Codons        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private static void findStopCodons(byte[] bases, IntList list, BitSet valid){
+		final int k=3;
+		final int mask=~((-1)<<(2*k));
+		int kmer=0;
+		int len=0;
+		
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			int x=AminoAcid.baseToNumber[b];
+			kmer=((kmer<<2)|x)&mask;
+			if(x>=0){
+				len++;
+				if(len>=k){
+					int point=i;//End of the stop codon
+					if(isStopCodon(kmer) && !valid.get(point)){
+						list.add(point);
+						valid.set(point);
+					}
+				}
+			}else{len=0;}
+		}
+		
+		for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
+			if(!valid.get(i)){
+				list.add(i);
+			}
+		}
+	}
+	
+	private static void findStartCodons(byte[] bases, IntList list, BitSet valid){
+		final int k=3;
+		final int mask=~((-1)<<(2*k));
+		int kmer=0;
+		int len=0;
+		
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			int x=AminoAcid.baseToNumber[b];
+			kmer=((kmer<<2)|x)&mask;
+			if(x>=0){
+				len++;
+				if(len>=k){
+					int point=i-k+1;//Start of the start codon
+					if(isStartCodon(kmer) && !valid.get(point)){
+						list.add(point);
+						valid.set(point);
+					}
+				}
+			}else{len=0;}
+		}
+		
+		for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
+			if(!valid.get(i)){
+				list.add(i);
+			}
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------     Processing GffLines      ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private static void processGene(GffLine gline, ScafData sd){
+		if(gline.length()<2){return;}
+		final int strand=gline.strand;
+		assert(strand==sd.strand());
+		final byte[] frames=sd.frames;
+		int start=gline.start-1, stop=gline.stop-1;
+		if(start<0 || stop>=sd.length()){return;}
+//		assert(start<stop) : gline; //Not always true for euks...
+		if(strand==Shared.MINUS){
+			int x=sd.length()-start-1;
+			int y=sd.length()-stop-1;
+			start=y;
+			stop=x;
+
+//			String a=new String(sd.bases, start, 3);
+//			String b=new String(sd.bases, stop-2, 3);
+////			assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3);
+//			outstream.println(a+", "+b+", "+start+", "+stop);
+		}
+		assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
+		markFrames(start, stop, frames, kInnerCDS);
+		sd.starts.add(start);
+		sd.stops.add(stop);
+//		assert(gline.start!=337) : gline+"\n"+start+", "+stop;
+	}
+	
+	private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){
+		final int strand=gline.strand;
+		assert(strand==sd.strand());
+		final byte[] frames=sd.frames;
+		int start=gline.start-1, stop=gline.stop-1;
+		if(start<0 || stop>=sd.length()){return false;}
+		assert(start<=stop);
+		if(strand==Shared.MINUS){
+			int x=sd.length()-start-1;
+			int y=sd.length()-stop-1;
+			start=y;
+			stop=x;
+		}
+		
+		if(AnalyzeGenes.alignRibo){
+//			byte[] seq=sd.fetch(start, stop);
+			Read[] consensusReads=ProkObject.consensusReads(type);
+			byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null);
+			float minIdentity=ProkObject.minID(type);
+			if(universal!=null){
+				int[] coords=KillSwitch.allocInt1D(2);
+				final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50));
+				final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50));
+				float id1=align(universal, sd.bases, a, b, minIdentity, coords);
+				final int rstart=coords[0], rstop=coords[1];
+//				assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
+				if(id1<minIdentity){
+//					System.err.println("Low identity: "+String.format("%.2s", 100*id1));
+					return false;
+				}else{
+//					System.err.println("Good identity: "+String.format("%.2s", 100*id1));
+				}
+				if(AnalyzeGenes.adjustEndpoints){
+					int startSlop=startSlop(type);
+					int stopSlop=stopSlop(type);
+					if(Tools.absdif(start, rstart)>startSlop){
+//						System.err.println("rstart:\t"+start+" -> "+rstart);
+						start=rstart;
+					}
+					if(Tools.absdif(stop, rstop)>stopSlop){
+//						System.err.println("rstop: \t"+stop+" -> "+rstop);
+						stop=rstop;
+					}
+				}
+			}
+		}
+		
+		StatsContainer sc=allContainers[type];
+		sc.start.processPoint(sd.bases, start, 1);
+		sc.stop.processPoint(sd.bases, stop, 1);
+		assert(sc!=statsCDS);
+		
+		assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
+		final byte flag=typeToFlag(type);
+		for(int i=start+kInnerRNA-1; i<=stop; i++){
+			frames[i]|=flag;
+		}
+		return true;
+	}
+	
+	private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){
+//		final int a=0, b=ref.length-1;
+		SingleStateAlignerFlat2 ssa=GeneCaller.getSSA();
+		final int minScore=ssa.minScoreByIdentity(query.length, minIdentity);
+		int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore);
+		if(max==null){return 0;}
+		
+		final int rows=max[0];
+		final int maxCol=max[1];
+		final int maxState=max[2];
+//		final int maxScore=max[3];
+//		final int maxStart=max[4];
+		
+		//returns {score, bestRefStart, bestRefStop} 
+		//padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
+		final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState);
+		final int rstart=Tools.max(score[1], 0);
+		final int rstop=Tools.min(score[2], ref.length-1);
+		if(coords!=null){
+			coords[0]=rstart;
+			coords[1]=rstop;
+		}
+		final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null);
+		return id;
+	}
+	
+	/** 
+	 * Each frame byte has a bit marked for valid coding frames.
+	 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon.
+	 * If frames[23]=0, then no coding kmer end at that location on this strand.
+	 * @param start
+	 * @param stop
+	 * @param frames
+	 * @param k
+	 */
+	private static void markFrames(int start, int stop, byte[] frames, int k){
+		assert(start<=stop) : start+", "+stop;
+		for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){
+			frames[i]=(byte)(frames[i]|frameBit);
+			frameBit<<=1;
+			if(frameBit>4){frameBit=1;}
+		}
+//		assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Counting Kmers        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private void processCDS(ScafData sd, int strand){
+		if(!callCDS){return;}
+		ArrayList<GffLine> glines=sd.cdsLines[strand];
+		for(GffLine gline : glines){
+			assert(gline.strand==strand);
+			processGene(gline, sd);
+			statsCDS.addLength(gline.length());
+		}
+		
+		statsCDS.inner.processCDSFrames(sd.bases, sd.frames);
+		BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1);
+		BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1);
+//		outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops.");
+		sd.clear();
+		findStartCodons(sd.bases, sd.starts, startSet);
+		findStopCodons(sd.bases, sd.stops, stopSet);
+//		outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops.");
+		processEnds(sd.bases, statsCDS.start, sd.starts, 0);
+		processEnds(sd.bases, statsCDS.stop, sd.stops, 0);
+	}
+	
+	private static int getGlineType(GffLine gline, ScafData sd){
+		if(!gline.inbounds(sd.length()) || gline.partial()){return -1;}
+
+		final int length=gline.length();
+		final int type=gline.prokType();
+		if(type<0){
+			return type;
+		}else if(type==CDS){
+			return type;
+		}else if(type==tRNA && length>=40 && length<=120){
+			return type;
+		}else if(type==r16S && length>=1440 && length<=1620){
+			return type;
+		}else if(type==r23S && length>=2720 && length<=3170){
+			return type;
+		}else if(type==r5S && length>=90 && length<=150){
+			return type;
+		}else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range
+			return type;
+		}
+		return -1;
+	}
+	
+	private void processRNA(ScafData sd, int strand){
+		sd.clear();
+		ArrayList<GffLine> lines=sd.rnaLines[strand];
+		for(GffLine gline : lines){
+			assert(gline.strand==strand);
+			final int type=getGlineType(gline, sd);
+			if(type>0){
+				StatsContainer sc=allContainers[type];
+				sc.addLength(gline.length());
+				processRnaLine(gline, sd, type);
+			}
+		}
+		processRnaInner(sd);
+		processRnaEnds(sd);
+	}
+	
+	void processRnaInner(ScafData sd){
+		byte[] bases=sd.bases;
+		byte[] frames=sd.frames;
+		final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs.
+		final int mask=~((-1)<<(2*k));
+		int kmer=0;
+		int len=0;
+		
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			int x=AminoAcid.baseToNumber[b];
+			kmer=((kmer<<2)|x)&mask;
+			if(x>=0){
+				len++;
+				if(len>=k){
+					int vf=frames[i];
+					for(int type=0; type<5; type++){
+						int valid=vf&1;
+						rnaContainers[type].inner.add(kmer, 0, valid);
+						vf=(vf>>1);
+					}
+				}
+			}else{len=0;}
+		}
+	}
+	
+	void processRnaEnds(ScafData sd){
+		byte[] bases=sd.bases;
+
+		final int k=stats16S.start.k;
+		final int kMax=stats16S.start.kMax;
+		final int mask=stats16S.start.mask;
+		final long[] counts=new long[kMax];//TODO: Slow
+		
+		int kmer=0;
+		int len=0;
+		
+		for(int i=0; i<bases.length; i++){
+			byte b=bases[i];
+			int x=AminoAcid.baseToNumber[b];
+			kmer=((kmer<<2)|x)&mask;
+			
+			if(x>=0){
+				len++;
+				if(len>=k){
+					counts[kmer]++;
+				}
+			}else{len=0;}
+		}
+		for(StatsContainer sc : rnaContainers){
+			FrameStats fs=sc.start;
+			for(long[] array : fs.countsFalse){
+				Tools.add(array, counts);
+			}
+			fs=sc.stop;
+			for(long[] array : fs.countsFalse){
+				Tools.add(array, counts);
+			}
+		}
+	}
+	
+	private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){
+		BitSet points=new BitSet(bases.length);
+		for(int i=0; i<list.size; i++){
+			int point=list.get(i);
+			stats.processPoint(bases, list.get(i), valid);
+			points.set(point);
+		}
+		return points;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------           Scoring            ----------------*/
+	/*--------------------------------------------------------------*/
+	
+//	//Assumes bases are in the correct strand
+//	public float calcStartScore(int start, int stop, byte[] bases){
+//		float f=scorePoint(start, bases, startStats);
+////		float ss=scoreStart2(start, bases, stop, innerKmerStats);
+////		if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study.
+//		return f;
+//	}
+//	
+//	//Assumes bases are in the correct strand
+//	public float calcStopScore(int stop, byte[] bases){
+//		float f=scorePoint(stop, bases, stopStats);
+//		return f;
+//	}
+//	
+//	//Assumes bases are in the correct strand
+//	public float calcRnaStartScore(int start, int stop, byte[] bases){
+//		float f=scorePoint(start, bases, rrnaStartStats);
+//		return f;
+//	}
+//	
+//	//Assumes bases are in the correct strand
+//	public float calcRnaStopScore(int stop, byte[] bases){
+//		float f=scorePoint(stop, bases, rrnaStopStats);
+//		return f;
+//	}
+	
+//	public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){
+//
+//		assert(stats.frames==3);
+//		final int k=stats.k;
+//		final int mask=~((-1)<<(2*k));
+//		
+//		int kmer=0;
+//		int len=0;
+//		float score=0;
+//		int numKmers=0;
+//		
+//		for(int pos=start, currentFrame=startFrame; pos<stop; pos++){
+//			final byte b=bases[pos];
+//			final int x=AminoAcid.baseToNumber[b];
+//
+//			if(x>=0){
+//				kmer=((kmer<<2)|x)&mask;
+//				len++;
+//				if(len>=k){
+//					float prob=stats.probs[currentFrame][kmer];
+//					float dif=prob-0.99f;//Prob above 1 is more likely than average
+//					score+=dif;
+//					numKmers++;
+//				}
+//			}else{
+//				len=0;
+//				kmer=0;
+//			}
+//
+//			currentFrame++;
+//			if(currentFrame>2){currentFrame=0;}
+//		}
+//		return score/Tools.max(1f, numKmers);
+//	}
+//	
+//	/**
+//	 * TODO
+//	 * Evaluate the relative difference between left and right frequencies.
+//	 * The purpose is to find locations where the left side looks noncoding and the right side looks coding.
+//	 * Does not currently yield useful results.
+//	 */
+//	public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){
+//		final int k=stats.k;
+//		
+//		int start=point-45;
+//		if(start<0 || stop>bases.length){return 0.5f;}
+//
+//		float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats);
+//		float right=calcKmerScore(point, stop-3, 0, bases, stats);
+//		return right-left; //High numbers are likely to be starts; non-starts should be near 0.
+//	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------           toString           ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	@Override
+	public String toString(){
+		return appendTo(new ByteBuilder()).toString();
+	}
+	
+	public ByteBuilder appendTo(ByteBuilder bb){
+
+//		Collections.sort(fnames);
+		taxIds.sort();
+		
+		bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n");
+		bb.append("#files");
+		bb.tab().append(numFiles);
+//		if(fnames.size()>5){
+//			bb.tab().append(fnames.size());
+//		}else{
+//			for(String fname : fnames){
+//				bb.tab().append(fname);
+//			}
+//		}
+		bb.nl();
+		bb.append("#taxIDs");
+		for(int i=0; i<taxIds.size; i++){
+			bb.tab().append(taxIds.get(i));
+		}
+		bb.nl();
+//		bb.append("#k_inner\t").append(innerKmerLength).nl();
+//		bb.append("#k_end\t").append(endKmerLength).nl();
+//		bb.append("#start_left_offset\t").append(startLeftOffset).nl();
+//		bb.append("#start_right_offset\t").append(startRightOffset).nl();
+//		bb.append("#stop_left_offset\t").append(stopLeftOffset).nl();
+//		bb.append("#stop_right_offset\t").append(stopRightOffset).nl();
+		bb.append("#scaffolds\t").append(readsProcessed).nl();
+		bb.append("#bases\t").append(basesProcessed).nl();
+		bb.append("#genes\t").append(genesProcessed).nl();
+		bb.append("#GC\t").append(gc(),2).nl();
+		bb.append("#ACGTN");
+		for(long x : baseCounts){
+			bb.tab().append(x);
+		}
+		bb.nl();
+
+		for(StatsContainer sc : allContainers){
+			sc.appendTo(bb);
+		}
+		assert(allContainers.length>5) : allContainers.length;
+		
+		return bb;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Stats             ----------------*/
+	/*--------------------------------------------------------------*/
+
+	public final StatsContainer statsCDS=new StatsContainer(CDS);
+	public final StatsContainer statstRNA=new StatsContainer(tRNA);
+	public final StatsContainer stats16S=new StatsContainer(r16S);
+	public final StatsContainer stats23S=new StatsContainer(r23S);
+	public final StatsContainer stats5S=new StatsContainer(r5S);
+	public final StatsContainer stats18S=new StatsContainer(r18S);
+	
+	final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S};
+	final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S};
+	//public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
+	
+//	public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0);
+//	public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset);
+//	public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset);
+//
+//	public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8);
+//	public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8);
+//	
+//	public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0);
+//	public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0);
+//	public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0);
+//	public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0);
+//	public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats};
+	
+	/*--------------------------------------------------------------*/
+	
+//	public ArrayList<String> fnames=new ArrayList<String>();
+	public int numFiles=0;
+	public IntList taxIds=new IntList();
+
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+
+	private long maxReads=-1;
+	
+	long readsProcessed=0;
+	long basesProcessed=0;
+	long genesProcessed=0;
+	long filesProcessed=0;
+	long[] baseCounts=new long[5];
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Static Setters        ----------------*/
+	/*--------------------------------------------------------------*/
+
+	public synchronized void setStatics(){
+//		assert(!setStatics);
+		kInnerCDS=statsCDS.inner.k;
+		kStartCDS=statsCDS.start.k;
+		kStopCDS=statsCDS.stop.k;
+		
+		setStartLeftOffset(statsCDS.start.leftOffset);
+		setStartRightOffset(statsCDS.start.rightOffset());
+		
+		setStopLeftOffset(statsCDS.stop.leftOffset);
+		setStopRightOffset(statsCDS.stop.rightOffset());
+		
+		kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here?
+		kStartRNA=stats16S.start.k;
+		kStopRNA=stats16S.stop.k;
+		setStatics=true;
+	}
+	
+	public static void setInnerK(int k){
+		kInnerCDS=k;
+	}
+	
+	public static void setStartK(int k){
+		kStartCDS=k;
+	}
+	
+	public static void setStopK(int k){
+		kStopCDS=k;
+	}
+	
+	public static void setStartLeftOffset(int x){
+		startLeftOffset=x;
+		startFrames=startLeftOffset+startRightOffset+1;
+//		System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
+	}
+	
+	public static void setStartRightOffset(int x){
+		startRightOffset=x;
+		startFrames=startLeftOffset+startRightOffset+1;
+//		System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
+//		assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
+	}
+	
+	public static void setStopLeftOffset(int x){
+		stopLeftOffset=x;
+		stopFrames=stopLeftOffset+stopRightOffset+1;
+//		System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
+	}
+	
+	public static void setStopRightOffset(int x){
+		stopRightOffset=x;
+		stopFrames=stopLeftOffset+stopRightOffset+1;
+//		System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
+//		assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
+	}
+	
+	public static final boolean isStartCodon(int code){
+		return code>=0 && code<=63 && isStartCodon[code];
+	}
+	public static final boolean isStopCodon(int code){
+		return code>=0 && code<=63 && isStopCodon[code];
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------          Class Init          ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private static boolean[] makeIsCodon(String[] codons){
+		boolean[] array=new boolean[64];
+		for(String s : codons){
+			int x=AminoAcid.toNumber(s);
+			array[x]=true;
+		}
+		return array;
+	}
+	
+	public static int kInnerCDS=6;
+	public static int kStartCDS=3;
+	public static int kStopCDS=3;
+	
+	static int startLeftOffset(){return startLeftOffset;}
+	static int startRightOffset(){return startRightOffset;}
+	static int startFrames(){return startFrames;}
+	
+	private static int startLeftOffset=21; //21 works well for k=4
+	private static int startRightOffset=8; //10 works well for k=4
+	private static int startFrames=startLeftOffset+startRightOffset+1;
+	
+	private static int stopLeftOffset=9;
+	private static int stopRightOffset=12;
+	private static int stopFrames=stopLeftOffset+stopRightOffset+1;
+	
+	private static boolean setStatics=false;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         More Statics         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	//E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9]
+	public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"};
+	public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"};
+	public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"};
+	public static boolean[] isStartCodon=makeIsCodon(startCodons);
+	public static boolean[] isStopCodon=makeIsCodon(stopCodons);
+	
+	/*--------------------------------------------------------------*/
+	
+	private static PrintStream outstream=System.err;
+	public static boolean verbose=false;
+	public static boolean errorState=false;
+	
+}