view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FrameStats.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line source
package prok;

import java.util.Arrays;

import dna.AminoAcid;
import shared.KillSwitch;
import shared.Parse;
import shared.Tools;
import structures.ByteBuilder;

/**
 * Stores frame-relative kmer counts for a type of genomic feature, such as a coding start site.
 * @author Brian Bushnell
 * @date Sep 24, 2018
 */
public class FrameStats {
	
	/*--------------------------------------------------------------*/
	/*----------------        Initialization        ----------------*/
	/*--------------------------------------------------------------*/
	
	public FrameStats(String name_, int k_, int frames_, int leftOffset_){
		name=name_;
		k=k_;
		mask=~((-1)<<(2*k));
		frames=frames_;
		kMax=1<<(2*k);
		invFrames=1.0f/frames;
		leftOffset=leftOffset_;
		
		probs=new float[frames][kMax];
		countsTrue=new long[frames][kMax];
		countsFalse=new long[frames][kMax];
		counts=new long[][][] {countsFalse, countsTrue};
	}
	
	/*--------------------------------------------------------------*/
	/*----------------           Methods            ----------------*/
	/*--------------------------------------------------------------*/
	
	public void add(int kmer, int frame, int valid){
		counts[valid][frame][kmer]++;
		validSums[valid]++;
	}
	
	public boolean compatibleWith(FrameStats fs) {
		return fs.name.equals(name) && fs.leftOffset==leftOffset && fs.k==k && fs.frames==frames;
	}
	
	public void clear() {
		Tools.fill(counts, 0);
		Arrays.fill(validSums, 0);
	}
	
	public void setFrom(FrameStats fs) {
		assert(compatibleWith(fs)) : name+", "+frames+", "+fs.name+", "+fs.frames;
		clear();
		add(fs);
	}
	
	public void add(FrameStats fs){
		assert(fs.name.equals(name));
		assert(fs.leftOffset==leftOffset);
		assert(fs.k==k);
		assert(fs.frames==frames) : name+", "+frames+", "+fs.name+", "+fs.frames;
//		for(int x=0; x<counts.length; x++) {
//			for(int y=0; y<counts[x].length; y++) {
//				for(int z=0; z<counts[x][y].length; z++) {
//					assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]+", "+fs.name;
//					assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
//				}
//			}
//		}

		Tools.add(counts, fs.counts);
		Tools.add(validSums, fs.validSums);
//		for(int x=0; x<counts.length; x++) {
//			for(int y=0; y<counts[x].length; y++) {
//				for(int z=0; z<counts[x][y].length; z++) {
//					assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
//					assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
//				}
//			}
//		}
//		calculate();
	}
	
	public void multiplyBy(double mult) {
		Tools.multiplyBy(counts, mult);
		Tools.multiplyBy(validSums, mult);
	}
	
	void calculate(){
		average=(float)((validSums[1]+1.0)/(validSums[0]+validSums[1]+1.0));
		invAvg=1.0f/average;
		
		for(int a=0; a<frames; a++){
			for(int b=0; b<kMax; b++){
				long t=countsTrue[a][b];
				long f=countsFalse[a][b];
				probs[a][b]=(float)(t/(t+f+1.0))*invAvg;
			}
		}
	}
	
	public float scorePoint(int point, byte[] bases){
		final int mask=~((-1)<<(2*k));
		
		int kmer=0;
		int len=0;
		float score=0;

//		outstream.println("k="+k);
//		outstream.println("mask="+mask);
		
		int start=point-leftOffset;
		for(int i=start, frame=0-k+1; i<bases.length && frame<frames; i++, frame++){
			byte b=(i>=0 ? bases[i] : (byte)'A');
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			
//			outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame);
			
			if(x>=0){
				len++;
				if(len>=k){
					float prob=probs[frame][kmer];
					float dif=prob-0.99f;
					score+=dif;
					
//					if(name.equals("startStats")){
//						System.err.println("frame="+frame+" kmer="+AminoAcid.kmerToString(kmer, k)+
//								String.format(Locale.ROOT, " prob=%.4f\tdif=%.4f\tscore=%.4f", prob, dif, score)+
//								"\tvalid="+counts[1][frame][kmer]+"\tinvalid="+counts[0][frame][kmer]);
//					}
				}
			}else{len=0;}
		}
		
		return score*invFrames;
	}
	
	void processCDSFrames(byte[] bases, byte[] validFrames){
		if(!ProkObject.callCDS){return;}
		int kmer=0;
		int len=0;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			if(x>=0){
				len++;
				if(len>=k){
					int vf=validFrames[i];
					for(int frame=0; frame<frames; frame++){
						int valid=vf&1;
						add(kmer, frame, valid);
						//For CDS start (0-based) of 189, i=192, j=189, vf=1, frame=0 - all as expected.
//						assert(valid==0) : "vf="+vf+", frame="+frame+", len="+len+", kmer="+AminoAcid.kmerToString(kmer, k)+", i="+i+", j="+j;
						vf=(vf>>1);
					}
				}
			}else{len=0;}
		}
	}
	
	void processPoint(byte[] bases, int point, int valid){
		
		//Degenerate cases where the point is at the end, possibly from a truncated gene.
		if(point<3){return;}
		if(point>=bases.length-3){return;}
		
		int kmer=0;
		int len=0;

//		outstream.println("k="+k);
//		outstream.println("mask="+mask);
		
		int start=point-leftOffset;
		
		int i=start, frame=0-k+1;
		while(i<0){i++; frame++;}
		for(; i<bases.length && frame<frames; i++, frame++){
			byte b=bases[i];
			int x=AminoAcid.baseToNumber[b];
			kmer=((kmer<<2)|x)&mask;
			
//			outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame);
			
			if(x>=0){
				len++;
				if(len>=k){
					add(kmer, frame, valid);
				}
			}else{len=0;}
		}
	}
	
	/*--------------------------------------------------------------*/
	/*----------------         Text Methods         ----------------*/
	/*--------------------------------------------------------------*/

	public void parseData(byte[] line) {
		int a=0, b=0;
		final int valid, frame;
		
		while(b<line.length && line[b]!='\t'){b++;}
		assert(b>a) : "Missing field 0: "+new String(line);
		valid=Parse.parseInt(line, a, b);
		b++;
		a=b;
		
		while(b<line.length && line[b]!='\t'){b++;}
		assert(b>a) : "Missing field 1: "+new String(line);
		frame=Parse.parseInt(line, a, b);
		b++;
		a=b;
		
		assert(valid==0 || valid==1);
		assert(frame>=0 && frame<frames);
		try {
			final long[] row=counts[valid][frame];
			long sum=0;
			for(int kmer=0; kmer<row.length; kmer++){
				while(b<line.length && line[b]!='\t'){b++;}
				assert(b>a) : "Missing field 1: "+new String(line);
				long count=Parse.parseLong(line, a, b);
				b++;
				a=b;
				row[kmer]=count;
				sum+=count;
			}
			validSums[valid]+=sum;
		} catch (Exception e) {
			System.err.println(new String(line)+"\n"+name);
			assert(false) : e;
		}
	}
	
	@Override
	public String toString(){
		return appendTo(new ByteBuilder()).toString();
	}
	
	public ByteBuilder appendTo(ByteBuilder bb){
		bb.append("#name\t").append(name).nl();
		bb.append("#k\t").append(k).nl();
		bb.append("#frames\t").append(frames).nl();
		bb.append("#offset\t").append(leftOffset).nl();
		bb.append("#valid\tframe");
		for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));}
		bb.nl();
		for(int a=0; a<2; a++){//valid
			for(int b=0; b<frames; b++){
				bb.append(a);
				bb.tab().append(b);
				for(int c=0; c<kMax; c++){
					bb.tab().append(counts[a][b][c]);
				}
				bb.nl();
			}
		}
		return bb;
	}
	
	public ByteBuilder append0(ByteBuilder bb){
		bb.append("#name\t").append(name).nl();
		bb.append("#k\t").append(k).nl();
		bb.append("#frames\t").append(frames).nl();
		bb.append("#offset\t").append(leftOffset).nl();
		bb.append("#valid\tframe");
		for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));}
		bb.nl();
		for(int a=0; a<2; a++){//valid
			for(int b=0; b<frames; b++){
				bb.append(a);
				bb.tab().append(b);
				for(int c=0; c<kMax; c++){
					bb.tab().append(0);
				}
				bb.nl();
			}
		}
		return bb;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/

	public final String name;
	public final int k;
	public final int mask;
	public final int frames;
	public final int kMax;
	public final float invFrames;
	public final int leftOffset;
	public int rightOffset() {return frames-leftOffset-1;}
	
	public final float[][] probs;
	public final long[][] countsTrue;
	public final long[][] countsFalse;
	public final long[][][] counts;

	public final long[] validSums=KillSwitch.allocLong1D(2);
	private float average=-1;
	private float invAvg=-1;
	
}