comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FrameStats.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package prok;
2
3 import java.util.Arrays;
4
5 import dna.AminoAcid;
6 import shared.KillSwitch;
7 import shared.Parse;
8 import shared.Tools;
9 import structures.ByteBuilder;
10
11 /**
12 * Stores frame-relative kmer counts for a type of genomic feature, such as a coding start site.
13 * @author Brian Bushnell
14 * @date Sep 24, 2018
15 */
16 public class FrameStats {
17
18 /*--------------------------------------------------------------*/
19 /*---------------- Initialization ----------------*/
20 /*--------------------------------------------------------------*/
21
22 public FrameStats(String name_, int k_, int frames_, int leftOffset_){
23 name=name_;
24 k=k_;
25 mask=~((-1)<<(2*k));
26 frames=frames_;
27 kMax=1<<(2*k);
28 invFrames=1.0f/frames;
29 leftOffset=leftOffset_;
30
31 probs=new float[frames][kMax];
32 countsTrue=new long[frames][kMax];
33 countsFalse=new long[frames][kMax];
34 counts=new long[][][] {countsFalse, countsTrue};
35 }
36
37 /*--------------------------------------------------------------*/
38 /*---------------- Methods ----------------*/
39 /*--------------------------------------------------------------*/
40
41 public void add(int kmer, int frame, int valid){
42 counts[valid][frame][kmer]++;
43 validSums[valid]++;
44 }
45
46 public boolean compatibleWith(FrameStats fs) {
47 return fs.name.equals(name) && fs.leftOffset==leftOffset && fs.k==k && fs.frames==frames;
48 }
49
50 public void clear() {
51 Tools.fill(counts, 0);
52 Arrays.fill(validSums, 0);
53 }
54
55 public void setFrom(FrameStats fs) {
56 assert(compatibleWith(fs)) : name+", "+frames+", "+fs.name+", "+fs.frames;
57 clear();
58 add(fs);
59 }
60
61 public void add(FrameStats fs){
62 assert(fs.name.equals(name));
63 assert(fs.leftOffset==leftOffset);
64 assert(fs.k==k);
65 assert(fs.frames==frames) : name+", "+frames+", "+fs.name+", "+fs.frames;
66 // for(int x=0; x<counts.length; x++) {
67 // for(int y=0; y<counts[x].length; y++) {
68 // for(int z=0; z<counts[x][y].length; z++) {
69 // assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]+", "+fs.name;
70 // assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
71 // }
72 // }
73 // }
74
75 Tools.add(counts, fs.counts);
76 Tools.add(validSums, fs.validSums);
77 // for(int x=0; x<counts.length; x++) {
78 // for(int y=0; y<counts[x].length; y++) {
79 // for(int z=0; z<counts[x][y].length; z++) {
80 // assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
81 // assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
82 // }
83 // }
84 // }
85 // calculate();
86 }
87
88 public void multiplyBy(double mult) {
89 Tools.multiplyBy(counts, mult);
90 Tools.multiplyBy(validSums, mult);
91 }
92
93 void calculate(){
94 average=(float)((validSums[1]+1.0)/(validSums[0]+validSums[1]+1.0));
95 invAvg=1.0f/average;
96
97 for(int a=0; a<frames; a++){
98 for(int b=0; b<kMax; b++){
99 long t=countsTrue[a][b];
100 long f=countsFalse[a][b];
101 probs[a][b]=(float)(t/(t+f+1.0))*invAvg;
102 }
103 }
104 }
105
106 public float scorePoint(int point, byte[] bases){
107 final int mask=~((-1)<<(2*k));
108
109 int kmer=0;
110 int len=0;
111 float score=0;
112
113 // outstream.println("k="+k);
114 // outstream.println("mask="+mask);
115
116 int start=point-leftOffset;
117 for(int i=start, frame=0-k+1; i<bases.length && frame<frames; i++, frame++){
118 byte b=(i>=0 ? bases[i] : (byte)'A');
119 int x=AminoAcid.baseToNumber[b];
120 kmer=((kmer<<2)|x)&mask;
121
122 // outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame);
123
124 if(x>=0){
125 len++;
126 if(len>=k){
127 float prob=probs[frame][kmer];
128 float dif=prob-0.99f;
129 score+=dif;
130
131 // if(name.equals("startStats")){
132 // System.err.println("frame="+frame+" kmer="+AminoAcid.kmerToString(kmer, k)+
133 // String.format(Locale.ROOT, " prob=%.4f\tdif=%.4f\tscore=%.4f", prob, dif, score)+
134 // "\tvalid="+counts[1][frame][kmer]+"\tinvalid="+counts[0][frame][kmer]);
135 // }
136 }
137 }else{len=0;}
138 }
139
140 return score*invFrames;
141 }
142
143 void processCDSFrames(byte[] bases, byte[] validFrames){
144 if(!ProkObject.callCDS){return;}
145 int kmer=0;
146 int len=0;
147
148 for(int i=0; i<bases.length; i++){
149 byte b=bases[i];
150 int x=AminoAcid.baseToNumber[b];
151 kmer=((kmer<<2)|x)&mask;
152 if(x>=0){
153 len++;
154 if(len>=k){
155 int vf=validFrames[i];
156 for(int frame=0; frame<frames; frame++){
157 int valid=vf&1;
158 add(kmer, frame, valid);
159 //For CDS start (0-based) of 189, i=192, j=189, vf=1, frame=0 - all as expected.
160 // assert(valid==0) : "vf="+vf+", frame="+frame+", len="+len+", kmer="+AminoAcid.kmerToString(kmer, k)+", i="+i+", j="+j;
161 vf=(vf>>1);
162 }
163 }
164 }else{len=0;}
165 }
166 }
167
168 void processPoint(byte[] bases, int point, int valid){
169
170 //Degenerate cases where the point is at the end, possibly from a truncated gene.
171 if(point<3){return;}
172 if(point>=bases.length-3){return;}
173
174 int kmer=0;
175 int len=0;
176
177 // outstream.println("k="+k);
178 // outstream.println("mask="+mask);
179
180 int start=point-leftOffset;
181
182 int i=start, frame=0-k+1;
183 while(i<0){i++; frame++;}
184 for(; i<bases.length && frame<frames; i++, frame++){
185 byte b=bases[i];
186 int x=AminoAcid.baseToNumber[b];
187 kmer=((kmer<<2)|x)&mask;
188
189 // outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame);
190
191 if(x>=0){
192 len++;
193 if(len>=k){
194 add(kmer, frame, valid);
195 }
196 }else{len=0;}
197 }
198 }
199
200 /*--------------------------------------------------------------*/
201 /*---------------- Text Methods ----------------*/
202 /*--------------------------------------------------------------*/
203
204 public void parseData(byte[] line) {
205 int a=0, b=0;
206 final int valid, frame;
207
208 while(b<line.length && line[b]!='\t'){b++;}
209 assert(b>a) : "Missing field 0: "+new String(line);
210 valid=Parse.parseInt(line, a, b);
211 b++;
212 a=b;
213
214 while(b<line.length && line[b]!='\t'){b++;}
215 assert(b>a) : "Missing field 1: "+new String(line);
216 frame=Parse.parseInt(line, a, b);
217 b++;
218 a=b;
219
220 assert(valid==0 || valid==1);
221 assert(frame>=0 && frame<frames);
222 try {
223 final long[] row=counts[valid][frame];
224 long sum=0;
225 for(int kmer=0; kmer<row.length; kmer++){
226 while(b<line.length && line[b]!='\t'){b++;}
227 assert(b>a) : "Missing field 1: "+new String(line);
228 long count=Parse.parseLong(line, a, b);
229 b++;
230 a=b;
231 row[kmer]=count;
232 sum+=count;
233 }
234 validSums[valid]+=sum;
235 } catch (Exception e) {
236 System.err.println(new String(line)+"\n"+name);
237 assert(false) : e;
238 }
239 }
240
241 @Override
242 public String toString(){
243 return appendTo(new ByteBuilder()).toString();
244 }
245
246 public ByteBuilder appendTo(ByteBuilder bb){
247 bb.append("#name\t").append(name).nl();
248 bb.append("#k\t").append(k).nl();
249 bb.append("#frames\t").append(frames).nl();
250 bb.append("#offset\t").append(leftOffset).nl();
251 bb.append("#valid\tframe");
252 for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));}
253 bb.nl();
254 for(int a=0; a<2; a++){//valid
255 for(int b=0; b<frames; b++){
256 bb.append(a);
257 bb.tab().append(b);
258 for(int c=0; c<kMax; c++){
259 bb.tab().append(counts[a][b][c]);
260 }
261 bb.nl();
262 }
263 }
264 return bb;
265 }
266
267 public ByteBuilder append0(ByteBuilder bb){
268 bb.append("#name\t").append(name).nl();
269 bb.append("#k\t").append(k).nl();
270 bb.append("#frames\t").append(frames).nl();
271 bb.append("#offset\t").append(leftOffset).nl();
272 bb.append("#valid\tframe");
273 for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));}
274 bb.nl();
275 for(int a=0; a<2; a++){//valid
276 for(int b=0; b<frames; b++){
277 bb.append(a);
278 bb.tab().append(b);
279 for(int c=0; c<kMax; c++){
280 bb.tab().append(0);
281 }
282 bb.nl();
283 }
284 }
285 return bb;
286 }
287
288 /*--------------------------------------------------------------*/
289 /*---------------- Fields ----------------*/
290 /*--------------------------------------------------------------*/
291
292 public final String name;
293 public final int k;
294 public final int mask;
295 public final int frames;
296 public final int kMax;
297 public final float invFrames;
298 public final int leftOffset;
299 public int rightOffset() {return frames-leftOffset-1;}
300
301 public final float[][] probs;
302 public final long[][] countsTrue;
303 public final long[][] countsFalse;
304 public final long[][][] counts;
305
306 public final long[] validSums=KillSwitch.allocLong1D(2);
307 private float average=-1;
308 private float invAvg=-1;
309
310 }