Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FrameStats.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FrameStats.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,310 @@ +package prok; + +import java.util.Arrays; + +import dna.AminoAcid; +import shared.KillSwitch; +import shared.Parse; +import shared.Tools; +import structures.ByteBuilder; + +/** + * Stores frame-relative kmer counts for a type of genomic feature, such as a coding start site. + * @author Brian Bushnell + * @date Sep 24, 2018 + */ +public class FrameStats { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + public FrameStats(String name_, int k_, int frames_, int leftOffset_){ + name=name_; + k=k_; + mask=~((-1)<<(2*k)); + frames=frames_; + kMax=1<<(2*k); + invFrames=1.0f/frames; + leftOffset=leftOffset_; + + probs=new float[frames][kMax]; + countsTrue=new long[frames][kMax]; + countsFalse=new long[frames][kMax]; + counts=new long[][][] {countsFalse, countsTrue}; + } + + /*--------------------------------------------------------------*/ + /*---------------- Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public void add(int kmer, int frame, int valid){ + counts[valid][frame][kmer]++; + validSums[valid]++; + } + + public boolean compatibleWith(FrameStats fs) { + return fs.name.equals(name) && fs.leftOffset==leftOffset && fs.k==k && fs.frames==frames; + } + + public void clear() { + Tools.fill(counts, 0); + Arrays.fill(validSums, 0); + } + + public void setFrom(FrameStats fs) { + assert(compatibleWith(fs)) : name+", "+frames+", "+fs.name+", "+fs.frames; + clear(); + add(fs); + } + + public void add(FrameStats fs){ + assert(fs.name.equals(name)); + assert(fs.leftOffset==leftOffset); + assert(fs.k==k); + assert(fs.frames==frames) : name+", "+frames+", "+fs.name+", "+fs.frames; +// for(int x=0; x<counts.length; x++) { +// for(int y=0; y<counts[x].length; y++) { +// for(int z=0; z<counts[x][y].length; z++) { +// assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]+", "+fs.name; +// assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]; +// } +// } +// } + + Tools.add(counts, fs.counts); + Tools.add(validSums, fs.validSums); +// for(int x=0; x<counts.length; x++) { +// for(int y=0; y<counts[x].length; y++) { +// for(int z=0; z<counts[x][y].length; z++) { +// assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]; +// assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]; +// } +// } +// } +// calculate(); + } + + public void multiplyBy(double mult) { + Tools.multiplyBy(counts, mult); + Tools.multiplyBy(validSums, mult); + } + + void calculate(){ + average=(float)((validSums[1]+1.0)/(validSums[0]+validSums[1]+1.0)); + invAvg=1.0f/average; + + for(int a=0; a<frames; a++){ + for(int b=0; b<kMax; b++){ + long t=countsTrue[a][b]; + long f=countsFalse[a][b]; + probs[a][b]=(float)(t/(t+f+1.0))*invAvg; + } + } + } + + public float scorePoint(int point, byte[] bases){ + final int mask=~((-1)<<(2*k)); + + int kmer=0; + int len=0; + float score=0; + +// outstream.println("k="+k); +// outstream.println("mask="+mask); + + int start=point-leftOffset; + for(int i=start, frame=0-k+1; i<bases.length && frame<frames; i++, frame++){ + byte b=(i>=0 ? bases[i] : (byte)'A'); + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + +// outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame); + + if(x>=0){ + len++; + if(len>=k){ + float prob=probs[frame][kmer]; + float dif=prob-0.99f; + score+=dif; + +// if(name.equals("startStats")){ +// System.err.println("frame="+frame+" kmer="+AminoAcid.kmerToString(kmer, k)+ +// String.format(Locale.ROOT, " prob=%.4f\tdif=%.4f\tscore=%.4f", prob, dif, score)+ +// "\tvalid="+counts[1][frame][kmer]+"\tinvalid="+counts[0][frame][kmer]); +// } + } + }else{len=0;} + } + + return score*invFrames; + } + + void processCDSFrames(byte[] bases, byte[] validFrames){ + if(!ProkObject.callCDS){return;} + int kmer=0; + int len=0; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + if(x>=0){ + len++; + if(len>=k){ + int vf=validFrames[i]; + for(int frame=0; frame<frames; frame++){ + int valid=vf&1; + add(kmer, frame, valid); + //For CDS start (0-based) of 189, i=192, j=189, vf=1, frame=0 - all as expected. +// assert(valid==0) : "vf="+vf+", frame="+frame+", len="+len+", kmer="+AminoAcid.kmerToString(kmer, k)+", i="+i+", j="+j; + vf=(vf>>1); + } + } + }else{len=0;} + } + } + + void processPoint(byte[] bases, int point, int valid){ + + //Degenerate cases where the point is at the end, possibly from a truncated gene. + if(point<3){return;} + if(point>=bases.length-3){return;} + + int kmer=0; + int len=0; + +// outstream.println("k="+k); +// outstream.println("mask="+mask); + + int start=point-leftOffset; + + int i=start, frame=0-k+1; + while(i<0){i++; frame++;} + for(; i<bases.length && frame<frames; i++, frame++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + +// outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame); + + if(x>=0){ + len++; + if(len>=k){ + add(kmer, frame, valid); + } + }else{len=0;} + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Text Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public void parseData(byte[] line) { + int a=0, b=0; + final int valid, frame; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 0: "+new String(line); + valid=Parse.parseInt(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 1: "+new String(line); + frame=Parse.parseInt(line, a, b); + b++; + a=b; + + assert(valid==0 || valid==1); + assert(frame>=0 && frame<frames); + try { + final long[] row=counts[valid][frame]; + long sum=0; + for(int kmer=0; kmer<row.length; kmer++){ + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 1: "+new String(line); + long count=Parse.parseLong(line, a, b); + b++; + a=b; + row[kmer]=count; + sum+=count; + } + validSums[valid]+=sum; + } catch (Exception e) { + System.err.println(new String(line)+"\n"+name); + assert(false) : e; + } + } + + @Override + public String toString(){ + return appendTo(new ByteBuilder()).toString(); + } + + public ByteBuilder appendTo(ByteBuilder bb){ + bb.append("#name\t").append(name).nl(); + bb.append("#k\t").append(k).nl(); + bb.append("#frames\t").append(frames).nl(); + bb.append("#offset\t").append(leftOffset).nl(); + bb.append("#valid\tframe"); + for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));} + bb.nl(); + for(int a=0; a<2; a++){//valid + for(int b=0; b<frames; b++){ + bb.append(a); + bb.tab().append(b); + for(int c=0; c<kMax; c++){ + bb.tab().append(counts[a][b][c]); + } + bb.nl(); + } + } + return bb; + } + + public ByteBuilder append0(ByteBuilder bb){ + bb.append("#name\t").append(name).nl(); + bb.append("#k\t").append(k).nl(); + bb.append("#frames\t").append(frames).nl(); + bb.append("#offset\t").append(leftOffset).nl(); + bb.append("#valid\tframe"); + for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));} + bb.nl(); + for(int a=0; a<2; a++){//valid + for(int b=0; b<frames; b++){ + bb.append(a); + bb.tab().append(b); + for(int c=0; c<kMax; c++){ + bb.tab().append(0); + } + bb.nl(); + } + } + return bb; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + public final String name; + public final int k; + public final int mask; + public final int frames; + public final int kMax; + public final float invFrames; + public final int leftOffset; + public int rightOffset() {return frames-leftOffset-1;} + + public final float[][] probs; + public final long[][] countsTrue; + public final long[][] countsFalse; + public final long[][][] counts; + + public final long[] validSums=KillSwitch.allocLong1D(2); + private float average=-1; + private float invAvg=-1; + +}