Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FrameStats.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package prok; | |
2 | |
3 import java.util.Arrays; | |
4 | |
5 import dna.AminoAcid; | |
6 import shared.KillSwitch; | |
7 import shared.Parse; | |
8 import shared.Tools; | |
9 import structures.ByteBuilder; | |
10 | |
11 /** | |
12 * Stores frame-relative kmer counts for a type of genomic feature, such as a coding start site. | |
13 * @author Brian Bushnell | |
14 * @date Sep 24, 2018 | |
15 */ | |
16 public class FrameStats { | |
17 | |
18 /*--------------------------------------------------------------*/ | |
19 /*---------------- Initialization ----------------*/ | |
20 /*--------------------------------------------------------------*/ | |
21 | |
22 public FrameStats(String name_, int k_, int frames_, int leftOffset_){ | |
23 name=name_; | |
24 k=k_; | |
25 mask=~((-1)<<(2*k)); | |
26 frames=frames_; | |
27 kMax=1<<(2*k); | |
28 invFrames=1.0f/frames; | |
29 leftOffset=leftOffset_; | |
30 | |
31 probs=new float[frames][kMax]; | |
32 countsTrue=new long[frames][kMax]; | |
33 countsFalse=new long[frames][kMax]; | |
34 counts=new long[][][] {countsFalse, countsTrue}; | |
35 } | |
36 | |
37 /*--------------------------------------------------------------*/ | |
38 /*---------------- Methods ----------------*/ | |
39 /*--------------------------------------------------------------*/ | |
40 | |
41 public void add(int kmer, int frame, int valid){ | |
42 counts[valid][frame][kmer]++; | |
43 validSums[valid]++; | |
44 } | |
45 | |
46 public boolean compatibleWith(FrameStats fs) { | |
47 return fs.name.equals(name) && fs.leftOffset==leftOffset && fs.k==k && fs.frames==frames; | |
48 } | |
49 | |
50 public void clear() { | |
51 Tools.fill(counts, 0); | |
52 Arrays.fill(validSums, 0); | |
53 } | |
54 | |
55 public void setFrom(FrameStats fs) { | |
56 assert(compatibleWith(fs)) : name+", "+frames+", "+fs.name+", "+fs.frames; | |
57 clear(); | |
58 add(fs); | |
59 } | |
60 | |
61 public void add(FrameStats fs){ | |
62 assert(fs.name.equals(name)); | |
63 assert(fs.leftOffset==leftOffset); | |
64 assert(fs.k==k); | |
65 assert(fs.frames==frames) : name+", "+frames+", "+fs.name+", "+fs.frames; | |
66 // for(int x=0; x<counts.length; x++) { | |
67 // for(int y=0; y<counts[x].length; y++) { | |
68 // for(int z=0; z<counts[x][y].length; z++) { | |
69 // assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]+", "+fs.name; | |
70 // assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]; | |
71 // } | |
72 // } | |
73 // } | |
74 | |
75 Tools.add(counts, fs.counts); | |
76 Tools.add(validSums, fs.validSums); | |
77 // for(int x=0; x<counts.length; x++) { | |
78 // for(int y=0; y<counts[x].length; y++) { | |
79 // for(int z=0; z<counts[x][y].length; z++) { | |
80 // assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]; | |
81 // assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]; | |
82 // } | |
83 // } | |
84 // } | |
85 // calculate(); | |
86 } | |
87 | |
88 public void multiplyBy(double mult) { | |
89 Tools.multiplyBy(counts, mult); | |
90 Tools.multiplyBy(validSums, mult); | |
91 } | |
92 | |
93 void calculate(){ | |
94 average=(float)((validSums[1]+1.0)/(validSums[0]+validSums[1]+1.0)); | |
95 invAvg=1.0f/average; | |
96 | |
97 for(int a=0; a<frames; a++){ | |
98 for(int b=0; b<kMax; b++){ | |
99 long t=countsTrue[a][b]; | |
100 long f=countsFalse[a][b]; | |
101 probs[a][b]=(float)(t/(t+f+1.0))*invAvg; | |
102 } | |
103 } | |
104 } | |
105 | |
106 public float scorePoint(int point, byte[] bases){ | |
107 final int mask=~((-1)<<(2*k)); | |
108 | |
109 int kmer=0; | |
110 int len=0; | |
111 float score=0; | |
112 | |
113 // outstream.println("k="+k); | |
114 // outstream.println("mask="+mask); | |
115 | |
116 int start=point-leftOffset; | |
117 for(int i=start, frame=0-k+1; i<bases.length && frame<frames; i++, frame++){ | |
118 byte b=(i>=0 ? bases[i] : (byte)'A'); | |
119 int x=AminoAcid.baseToNumber[b]; | |
120 kmer=((kmer<<2)|x)&mask; | |
121 | |
122 // outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame); | |
123 | |
124 if(x>=0){ | |
125 len++; | |
126 if(len>=k){ | |
127 float prob=probs[frame][kmer]; | |
128 float dif=prob-0.99f; | |
129 score+=dif; | |
130 | |
131 // if(name.equals("startStats")){ | |
132 // System.err.println("frame="+frame+" kmer="+AminoAcid.kmerToString(kmer, k)+ | |
133 // String.format(Locale.ROOT, " prob=%.4f\tdif=%.4f\tscore=%.4f", prob, dif, score)+ | |
134 // "\tvalid="+counts[1][frame][kmer]+"\tinvalid="+counts[0][frame][kmer]); | |
135 // } | |
136 } | |
137 }else{len=0;} | |
138 } | |
139 | |
140 return score*invFrames; | |
141 } | |
142 | |
143 void processCDSFrames(byte[] bases, byte[] validFrames){ | |
144 if(!ProkObject.callCDS){return;} | |
145 int kmer=0; | |
146 int len=0; | |
147 | |
148 for(int i=0; i<bases.length; i++){ | |
149 byte b=bases[i]; | |
150 int x=AminoAcid.baseToNumber[b]; | |
151 kmer=((kmer<<2)|x)&mask; | |
152 if(x>=0){ | |
153 len++; | |
154 if(len>=k){ | |
155 int vf=validFrames[i]; | |
156 for(int frame=0; frame<frames; frame++){ | |
157 int valid=vf&1; | |
158 add(kmer, frame, valid); | |
159 //For CDS start (0-based) of 189, i=192, j=189, vf=1, frame=0 - all as expected. | |
160 // assert(valid==0) : "vf="+vf+", frame="+frame+", len="+len+", kmer="+AminoAcid.kmerToString(kmer, k)+", i="+i+", j="+j; | |
161 vf=(vf>>1); | |
162 } | |
163 } | |
164 }else{len=0;} | |
165 } | |
166 } | |
167 | |
168 void processPoint(byte[] bases, int point, int valid){ | |
169 | |
170 //Degenerate cases where the point is at the end, possibly from a truncated gene. | |
171 if(point<3){return;} | |
172 if(point>=bases.length-3){return;} | |
173 | |
174 int kmer=0; | |
175 int len=0; | |
176 | |
177 // outstream.println("k="+k); | |
178 // outstream.println("mask="+mask); | |
179 | |
180 int start=point-leftOffset; | |
181 | |
182 int i=start, frame=0-k+1; | |
183 while(i<0){i++; frame++;} | |
184 for(; i<bases.length && frame<frames; i++, frame++){ | |
185 byte b=bases[i]; | |
186 int x=AminoAcid.baseToNumber[b]; | |
187 kmer=((kmer<<2)|x)&mask; | |
188 | |
189 // outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame); | |
190 | |
191 if(x>=0){ | |
192 len++; | |
193 if(len>=k){ | |
194 add(kmer, frame, valid); | |
195 } | |
196 }else{len=0;} | |
197 } | |
198 } | |
199 | |
200 /*--------------------------------------------------------------*/ | |
201 /*---------------- Text Methods ----------------*/ | |
202 /*--------------------------------------------------------------*/ | |
203 | |
204 public void parseData(byte[] line) { | |
205 int a=0, b=0; | |
206 final int valid, frame; | |
207 | |
208 while(b<line.length && line[b]!='\t'){b++;} | |
209 assert(b>a) : "Missing field 0: "+new String(line); | |
210 valid=Parse.parseInt(line, a, b); | |
211 b++; | |
212 a=b; | |
213 | |
214 while(b<line.length && line[b]!='\t'){b++;} | |
215 assert(b>a) : "Missing field 1: "+new String(line); | |
216 frame=Parse.parseInt(line, a, b); | |
217 b++; | |
218 a=b; | |
219 | |
220 assert(valid==0 || valid==1); | |
221 assert(frame>=0 && frame<frames); | |
222 try { | |
223 final long[] row=counts[valid][frame]; | |
224 long sum=0; | |
225 for(int kmer=0; kmer<row.length; kmer++){ | |
226 while(b<line.length && line[b]!='\t'){b++;} | |
227 assert(b>a) : "Missing field 1: "+new String(line); | |
228 long count=Parse.parseLong(line, a, b); | |
229 b++; | |
230 a=b; | |
231 row[kmer]=count; | |
232 sum+=count; | |
233 } | |
234 validSums[valid]+=sum; | |
235 } catch (Exception e) { | |
236 System.err.println(new String(line)+"\n"+name); | |
237 assert(false) : e; | |
238 } | |
239 } | |
240 | |
241 @Override | |
242 public String toString(){ | |
243 return appendTo(new ByteBuilder()).toString(); | |
244 } | |
245 | |
246 public ByteBuilder appendTo(ByteBuilder bb){ | |
247 bb.append("#name\t").append(name).nl(); | |
248 bb.append("#k\t").append(k).nl(); | |
249 bb.append("#frames\t").append(frames).nl(); | |
250 bb.append("#offset\t").append(leftOffset).nl(); | |
251 bb.append("#valid\tframe"); | |
252 for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));} | |
253 bb.nl(); | |
254 for(int a=0; a<2; a++){//valid | |
255 for(int b=0; b<frames; b++){ | |
256 bb.append(a); | |
257 bb.tab().append(b); | |
258 for(int c=0; c<kMax; c++){ | |
259 bb.tab().append(counts[a][b][c]); | |
260 } | |
261 bb.nl(); | |
262 } | |
263 } | |
264 return bb; | |
265 } | |
266 | |
267 public ByteBuilder append0(ByteBuilder bb){ | |
268 bb.append("#name\t").append(name).nl(); | |
269 bb.append("#k\t").append(k).nl(); | |
270 bb.append("#frames\t").append(frames).nl(); | |
271 bb.append("#offset\t").append(leftOffset).nl(); | |
272 bb.append("#valid\tframe"); | |
273 for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));} | |
274 bb.nl(); | |
275 for(int a=0; a<2; a++){//valid | |
276 for(int b=0; b<frames; b++){ | |
277 bb.append(a); | |
278 bb.tab().append(b); | |
279 for(int c=0; c<kMax; c++){ | |
280 bb.tab().append(0); | |
281 } | |
282 bb.nl(); | |
283 } | |
284 } | |
285 return bb; | |
286 } | |
287 | |
288 /*--------------------------------------------------------------*/ | |
289 /*---------------- Fields ----------------*/ | |
290 /*--------------------------------------------------------------*/ | |
291 | |
292 public final String name; | |
293 public final int k; | |
294 public final int mask; | |
295 public final int frames; | |
296 public final int kMax; | |
297 public final float invFrames; | |
298 public final int leftOffset; | |
299 public int rightOffset() {return frames-leftOffset-1;} | |
300 | |
301 public final float[][] probs; | |
302 public final long[][] countsTrue; | |
303 public final long[][] countsFalse; | |
304 public final long[][][] counts; | |
305 | |
306 public final long[] validSums=KillSwitch.allocLong1D(2); | |
307 private float average=-1; | |
308 private float invAvg=-1; | |
309 | |
310 } |