jpayne@68
|
1 package prok;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.util.Arrays;
|
jpayne@68
|
4
|
jpayne@68
|
5 import dna.AminoAcid;
|
jpayne@68
|
6 import shared.KillSwitch;
|
jpayne@68
|
7 import shared.Parse;
|
jpayne@68
|
8 import shared.Tools;
|
jpayne@68
|
9 import structures.ByteBuilder;
|
jpayne@68
|
10
|
jpayne@68
|
11 /**
|
jpayne@68
|
12 * Stores frame-relative kmer counts for a type of genomic feature, such as a coding start site.
|
jpayne@68
|
13 * @author Brian Bushnell
|
jpayne@68
|
14 * @date Sep 24, 2018
|
jpayne@68
|
15 */
|
jpayne@68
|
16 public class FrameStats {
|
jpayne@68
|
17
|
jpayne@68
|
18 /*--------------------------------------------------------------*/
|
jpayne@68
|
19 /*---------------- Initialization ----------------*/
|
jpayne@68
|
20 /*--------------------------------------------------------------*/
|
jpayne@68
|
21
|
jpayne@68
|
22 public FrameStats(String name_, int k_, int frames_, int leftOffset_){
|
jpayne@68
|
23 name=name_;
|
jpayne@68
|
24 k=k_;
|
jpayne@68
|
25 mask=~((-1)<<(2*k));
|
jpayne@68
|
26 frames=frames_;
|
jpayne@68
|
27 kMax=1<<(2*k);
|
jpayne@68
|
28 invFrames=1.0f/frames;
|
jpayne@68
|
29 leftOffset=leftOffset_;
|
jpayne@68
|
30
|
jpayne@68
|
31 probs=new float[frames][kMax];
|
jpayne@68
|
32 countsTrue=new long[frames][kMax];
|
jpayne@68
|
33 countsFalse=new long[frames][kMax];
|
jpayne@68
|
34 counts=new long[][][] {countsFalse, countsTrue};
|
jpayne@68
|
35 }
|
jpayne@68
|
36
|
jpayne@68
|
37 /*--------------------------------------------------------------*/
|
jpayne@68
|
38 /*---------------- Methods ----------------*/
|
jpayne@68
|
39 /*--------------------------------------------------------------*/
|
jpayne@68
|
40
|
jpayne@68
|
41 public void add(int kmer, int frame, int valid){
|
jpayne@68
|
42 counts[valid][frame][kmer]++;
|
jpayne@68
|
43 validSums[valid]++;
|
jpayne@68
|
44 }
|
jpayne@68
|
45
|
jpayne@68
|
46 public boolean compatibleWith(FrameStats fs) {
|
jpayne@68
|
47 return fs.name.equals(name) && fs.leftOffset==leftOffset && fs.k==k && fs.frames==frames;
|
jpayne@68
|
48 }
|
jpayne@68
|
49
|
jpayne@68
|
50 public void clear() {
|
jpayne@68
|
51 Tools.fill(counts, 0);
|
jpayne@68
|
52 Arrays.fill(validSums, 0);
|
jpayne@68
|
53 }
|
jpayne@68
|
54
|
jpayne@68
|
55 public void setFrom(FrameStats fs) {
|
jpayne@68
|
56 assert(compatibleWith(fs)) : name+", "+frames+", "+fs.name+", "+fs.frames;
|
jpayne@68
|
57 clear();
|
jpayne@68
|
58 add(fs);
|
jpayne@68
|
59 }
|
jpayne@68
|
60
|
jpayne@68
|
61 public void add(FrameStats fs){
|
jpayne@68
|
62 assert(fs.name.equals(name));
|
jpayne@68
|
63 assert(fs.leftOffset==leftOffset);
|
jpayne@68
|
64 assert(fs.k==k);
|
jpayne@68
|
65 assert(fs.frames==frames) : name+", "+frames+", "+fs.name+", "+fs.frames;
|
jpayne@68
|
66 // for(int x=0; x<counts.length; x++) {
|
jpayne@68
|
67 // for(int y=0; y<counts[x].length; y++) {
|
jpayne@68
|
68 // for(int z=0; z<counts[x][y].length; z++) {
|
jpayne@68
|
69 // assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z]+", "+fs.name;
|
jpayne@68
|
70 // assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
|
jpayne@68
|
71 // }
|
jpayne@68
|
72 // }
|
jpayne@68
|
73 // }
|
jpayne@68
|
74
|
jpayne@68
|
75 Tools.add(counts, fs.counts);
|
jpayne@68
|
76 Tools.add(validSums, fs.validSums);
|
jpayne@68
|
77 // for(int x=0; x<counts.length; x++) {
|
jpayne@68
|
78 // for(int y=0; y<counts[x].length; y++) {
|
jpayne@68
|
79 // for(int z=0; z<counts[x][y].length; z++) {
|
jpayne@68
|
80 // assert(fs.counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
|
jpayne@68
|
81 // assert(counts[x][y][z]>=0) : counts[x][y][z]+", "+fs.counts[x][y][z];
|
jpayne@68
|
82 // }
|
jpayne@68
|
83 // }
|
jpayne@68
|
84 // }
|
jpayne@68
|
85 // calculate();
|
jpayne@68
|
86 }
|
jpayne@68
|
87
|
jpayne@68
|
88 public void multiplyBy(double mult) {
|
jpayne@68
|
89 Tools.multiplyBy(counts, mult);
|
jpayne@68
|
90 Tools.multiplyBy(validSums, mult);
|
jpayne@68
|
91 }
|
jpayne@68
|
92
|
jpayne@68
|
93 void calculate(){
|
jpayne@68
|
94 average=(float)((validSums[1]+1.0)/(validSums[0]+validSums[1]+1.0));
|
jpayne@68
|
95 invAvg=1.0f/average;
|
jpayne@68
|
96
|
jpayne@68
|
97 for(int a=0; a<frames; a++){
|
jpayne@68
|
98 for(int b=0; b<kMax; b++){
|
jpayne@68
|
99 long t=countsTrue[a][b];
|
jpayne@68
|
100 long f=countsFalse[a][b];
|
jpayne@68
|
101 probs[a][b]=(float)(t/(t+f+1.0))*invAvg;
|
jpayne@68
|
102 }
|
jpayne@68
|
103 }
|
jpayne@68
|
104 }
|
jpayne@68
|
105
|
jpayne@68
|
106 public float scorePoint(int point, byte[] bases){
|
jpayne@68
|
107 final int mask=~((-1)<<(2*k));
|
jpayne@68
|
108
|
jpayne@68
|
109 int kmer=0;
|
jpayne@68
|
110 int len=0;
|
jpayne@68
|
111 float score=0;
|
jpayne@68
|
112
|
jpayne@68
|
113 // outstream.println("k="+k);
|
jpayne@68
|
114 // outstream.println("mask="+mask);
|
jpayne@68
|
115
|
jpayne@68
|
116 int start=point-leftOffset;
|
jpayne@68
|
117 for(int i=start, frame=0-k+1; i<bases.length && frame<frames; i++, frame++){
|
jpayne@68
|
118 byte b=(i>=0 ? bases[i] : (byte)'A');
|
jpayne@68
|
119 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
120 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
121
|
jpayne@68
|
122 // outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame);
|
jpayne@68
|
123
|
jpayne@68
|
124 if(x>=0){
|
jpayne@68
|
125 len++;
|
jpayne@68
|
126 if(len>=k){
|
jpayne@68
|
127 float prob=probs[frame][kmer];
|
jpayne@68
|
128 float dif=prob-0.99f;
|
jpayne@68
|
129 score+=dif;
|
jpayne@68
|
130
|
jpayne@68
|
131 // if(name.equals("startStats")){
|
jpayne@68
|
132 // System.err.println("frame="+frame+" kmer="+AminoAcid.kmerToString(kmer, k)+
|
jpayne@68
|
133 // String.format(Locale.ROOT, " prob=%.4f\tdif=%.4f\tscore=%.4f", prob, dif, score)+
|
jpayne@68
|
134 // "\tvalid="+counts[1][frame][kmer]+"\tinvalid="+counts[0][frame][kmer]);
|
jpayne@68
|
135 // }
|
jpayne@68
|
136 }
|
jpayne@68
|
137 }else{len=0;}
|
jpayne@68
|
138 }
|
jpayne@68
|
139
|
jpayne@68
|
140 return score*invFrames;
|
jpayne@68
|
141 }
|
jpayne@68
|
142
|
jpayne@68
|
143 void processCDSFrames(byte[] bases, byte[] validFrames){
|
jpayne@68
|
144 if(!ProkObject.callCDS){return;}
|
jpayne@68
|
145 int kmer=0;
|
jpayne@68
|
146 int len=0;
|
jpayne@68
|
147
|
jpayne@68
|
148 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
149 byte b=bases[i];
|
jpayne@68
|
150 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
151 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
152 if(x>=0){
|
jpayne@68
|
153 len++;
|
jpayne@68
|
154 if(len>=k){
|
jpayne@68
|
155 int vf=validFrames[i];
|
jpayne@68
|
156 for(int frame=0; frame<frames; frame++){
|
jpayne@68
|
157 int valid=vf&1;
|
jpayne@68
|
158 add(kmer, frame, valid);
|
jpayne@68
|
159 //For CDS start (0-based) of 189, i=192, j=189, vf=1, frame=0 - all as expected.
|
jpayne@68
|
160 // assert(valid==0) : "vf="+vf+", frame="+frame+", len="+len+", kmer="+AminoAcid.kmerToString(kmer, k)+", i="+i+", j="+j;
|
jpayne@68
|
161 vf=(vf>>1);
|
jpayne@68
|
162 }
|
jpayne@68
|
163 }
|
jpayne@68
|
164 }else{len=0;}
|
jpayne@68
|
165 }
|
jpayne@68
|
166 }
|
jpayne@68
|
167
|
jpayne@68
|
168 void processPoint(byte[] bases, int point, int valid){
|
jpayne@68
|
169
|
jpayne@68
|
170 //Degenerate cases where the point is at the end, possibly from a truncated gene.
|
jpayne@68
|
171 if(point<3){return;}
|
jpayne@68
|
172 if(point>=bases.length-3){return;}
|
jpayne@68
|
173
|
jpayne@68
|
174 int kmer=0;
|
jpayne@68
|
175 int len=0;
|
jpayne@68
|
176
|
jpayne@68
|
177 // outstream.println("k="+k);
|
jpayne@68
|
178 // outstream.println("mask="+mask);
|
jpayne@68
|
179
|
jpayne@68
|
180 int start=point-leftOffset;
|
jpayne@68
|
181
|
jpayne@68
|
182 int i=start, frame=0-k+1;
|
jpayne@68
|
183 while(i<0){i++; frame++;}
|
jpayne@68
|
184 for(; i<bases.length && frame<frames; i++, frame++){
|
jpayne@68
|
185 byte b=bases[i];
|
jpayne@68
|
186 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
187 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
188
|
jpayne@68
|
189 // outstream.println("b="+(char)b+", kmer="+kmer+", len="+(len+1)+", frame="+frame);
|
jpayne@68
|
190
|
jpayne@68
|
191 if(x>=0){
|
jpayne@68
|
192 len++;
|
jpayne@68
|
193 if(len>=k){
|
jpayne@68
|
194 add(kmer, frame, valid);
|
jpayne@68
|
195 }
|
jpayne@68
|
196 }else{len=0;}
|
jpayne@68
|
197 }
|
jpayne@68
|
198 }
|
jpayne@68
|
199
|
jpayne@68
|
200 /*--------------------------------------------------------------*/
|
jpayne@68
|
201 /*---------------- Text Methods ----------------*/
|
jpayne@68
|
202 /*--------------------------------------------------------------*/
|
jpayne@68
|
203
|
jpayne@68
|
204 public void parseData(byte[] line) {
|
jpayne@68
|
205 int a=0, b=0;
|
jpayne@68
|
206 final int valid, frame;
|
jpayne@68
|
207
|
jpayne@68
|
208 while(b<line.length && line[b]!='\t'){b++;}
|
jpayne@68
|
209 assert(b>a) : "Missing field 0: "+new String(line);
|
jpayne@68
|
210 valid=Parse.parseInt(line, a, b);
|
jpayne@68
|
211 b++;
|
jpayne@68
|
212 a=b;
|
jpayne@68
|
213
|
jpayne@68
|
214 while(b<line.length && line[b]!='\t'){b++;}
|
jpayne@68
|
215 assert(b>a) : "Missing field 1: "+new String(line);
|
jpayne@68
|
216 frame=Parse.parseInt(line, a, b);
|
jpayne@68
|
217 b++;
|
jpayne@68
|
218 a=b;
|
jpayne@68
|
219
|
jpayne@68
|
220 assert(valid==0 || valid==1);
|
jpayne@68
|
221 assert(frame>=0 && frame<frames);
|
jpayne@68
|
222 try {
|
jpayne@68
|
223 final long[] row=counts[valid][frame];
|
jpayne@68
|
224 long sum=0;
|
jpayne@68
|
225 for(int kmer=0; kmer<row.length; kmer++){
|
jpayne@68
|
226 while(b<line.length && line[b]!='\t'){b++;}
|
jpayne@68
|
227 assert(b>a) : "Missing field 1: "+new String(line);
|
jpayne@68
|
228 long count=Parse.parseLong(line, a, b);
|
jpayne@68
|
229 b++;
|
jpayne@68
|
230 a=b;
|
jpayne@68
|
231 row[kmer]=count;
|
jpayne@68
|
232 sum+=count;
|
jpayne@68
|
233 }
|
jpayne@68
|
234 validSums[valid]+=sum;
|
jpayne@68
|
235 } catch (Exception e) {
|
jpayne@68
|
236 System.err.println(new String(line)+"\n"+name);
|
jpayne@68
|
237 assert(false) : e;
|
jpayne@68
|
238 }
|
jpayne@68
|
239 }
|
jpayne@68
|
240
|
jpayne@68
|
241 @Override
|
jpayne@68
|
242 public String toString(){
|
jpayne@68
|
243 return appendTo(new ByteBuilder()).toString();
|
jpayne@68
|
244 }
|
jpayne@68
|
245
|
jpayne@68
|
246 public ByteBuilder appendTo(ByteBuilder bb){
|
jpayne@68
|
247 bb.append("#name\t").append(name).nl();
|
jpayne@68
|
248 bb.append("#k\t").append(k).nl();
|
jpayne@68
|
249 bb.append("#frames\t").append(frames).nl();
|
jpayne@68
|
250 bb.append("#offset\t").append(leftOffset).nl();
|
jpayne@68
|
251 bb.append("#valid\tframe");
|
jpayne@68
|
252 for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));}
|
jpayne@68
|
253 bb.nl();
|
jpayne@68
|
254 for(int a=0; a<2; a++){//valid
|
jpayne@68
|
255 for(int b=0; b<frames; b++){
|
jpayne@68
|
256 bb.append(a);
|
jpayne@68
|
257 bb.tab().append(b);
|
jpayne@68
|
258 for(int c=0; c<kMax; c++){
|
jpayne@68
|
259 bb.tab().append(counts[a][b][c]);
|
jpayne@68
|
260 }
|
jpayne@68
|
261 bb.nl();
|
jpayne@68
|
262 }
|
jpayne@68
|
263 }
|
jpayne@68
|
264 return bb;
|
jpayne@68
|
265 }
|
jpayne@68
|
266
|
jpayne@68
|
267 public ByteBuilder append0(ByteBuilder bb){
|
jpayne@68
|
268 bb.append("#name\t").append(name).nl();
|
jpayne@68
|
269 bb.append("#k\t").append(k).nl();
|
jpayne@68
|
270 bb.append("#frames\t").append(frames).nl();
|
jpayne@68
|
271 bb.append("#offset\t").append(leftOffset).nl();
|
jpayne@68
|
272 bb.append("#valid\tframe");
|
jpayne@68
|
273 for(int i=0; i<kMax; i++){bb.tab().append(AminoAcid.kmerToString(i, k));}
|
jpayne@68
|
274 bb.nl();
|
jpayne@68
|
275 for(int a=0; a<2; a++){//valid
|
jpayne@68
|
276 for(int b=0; b<frames; b++){
|
jpayne@68
|
277 bb.append(a);
|
jpayne@68
|
278 bb.tab().append(b);
|
jpayne@68
|
279 for(int c=0; c<kMax; c++){
|
jpayne@68
|
280 bb.tab().append(0);
|
jpayne@68
|
281 }
|
jpayne@68
|
282 bb.nl();
|
jpayne@68
|
283 }
|
jpayne@68
|
284 }
|
jpayne@68
|
285 return bb;
|
jpayne@68
|
286 }
|
jpayne@68
|
287
|
jpayne@68
|
288 /*--------------------------------------------------------------*/
|
jpayne@68
|
289 /*---------------- Fields ----------------*/
|
jpayne@68
|
290 /*--------------------------------------------------------------*/
|
jpayne@68
|
291
|
jpayne@68
|
292 public final String name;
|
jpayne@68
|
293 public final int k;
|
jpayne@68
|
294 public final int mask;
|
jpayne@68
|
295 public final int frames;
|
jpayne@68
|
296 public final int kMax;
|
jpayne@68
|
297 public final float invFrames;
|
jpayne@68
|
298 public final int leftOffset;
|
jpayne@68
|
299 public int rightOffset() {return frames-leftOffset-1;}
|
jpayne@68
|
300
|
jpayne@68
|
301 public final float[][] probs;
|
jpayne@68
|
302 public final long[][] countsTrue;
|
jpayne@68
|
303 public final long[][] countsFalse;
|
jpayne@68
|
304 public final long[][][] counts;
|
jpayne@68
|
305
|
jpayne@68
|
306 public final long[] validSums=KillSwitch.allocLong1D(2);
|
jpayne@68
|
307 private float average=-1;
|
jpayne@68
|
308 private float invAvg=-1;
|
jpayne@68
|
309
|
jpayne@68
|
310 }
|