jpayne@68
|
1 package prok;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.PrintStream;
|
jpayne@68
|
4 import java.util.ArrayList;
|
jpayne@68
|
5 import java.util.BitSet;
|
jpayne@68
|
6 import java.util.HashMap;
|
jpayne@68
|
7
|
jpayne@68
|
8 import aligner.SingleStateAlignerFlat2;
|
jpayne@68
|
9 import dna.AminoAcid;
|
jpayne@68
|
10 import fileIO.FileFormat;
|
jpayne@68
|
11 import gff.GffLine;
|
jpayne@68
|
12 import shared.KillSwitch;
|
jpayne@68
|
13 import shared.Shared;
|
jpayne@68
|
14 import shared.Tools;
|
jpayne@68
|
15 import stream.Read;
|
jpayne@68
|
16 import stream.ReadInputStream;
|
jpayne@68
|
17 import structures.ByteBuilder;
|
jpayne@68
|
18 import structures.IntList;
|
jpayne@68
|
19
|
jpayne@68
|
20 /**
|
jpayne@68
|
21 * This class is designed to store kmer frequencies related to gene
|
jpayne@68
|
22 * starts, stops, and interiors. It can be loaded from a pgm file.
|
jpayne@68
|
23 *
|
jpayne@68
|
24 * It's possible to use multiple GeneModels; for example, one for
|
jpayne@68
|
25 * each of several GC ranges or clades.
|
jpayne@68
|
26 * @author Brian Bushnell
|
jpayne@68
|
27 * @date Sep 24, 2018
|
jpayne@68
|
28 *
|
jpayne@68
|
29 */
|
jpayne@68
|
30 public class GeneModel extends ProkObject {
|
jpayne@68
|
31
|
jpayne@68
|
32 /*--------------------------------------------------------------*/
|
jpayne@68
|
33 /*---------------- Initialization ----------------*/
|
jpayne@68
|
34 /*--------------------------------------------------------------*/
|
jpayne@68
|
35
|
jpayne@68
|
36 public GeneModel(boolean fill){
|
jpayne@68
|
37 if(fill){
|
jpayne@68
|
38 fillContainers();
|
jpayne@68
|
39 }
|
jpayne@68
|
40 }
|
jpayne@68
|
41
|
jpayne@68
|
42 void fillContainers(){
|
jpayne@68
|
43 statsCDS.setInner(kInnerCDS, 3);
|
jpayne@68
|
44 statsCDS.setStart(kStartCDS, startFrames, startLeftOffset);
|
jpayne@68
|
45 statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset);
|
jpayne@68
|
46
|
jpayne@68
|
47 for(int i=0; i<rnaContainers.length; i++){
|
jpayne@68
|
48 StatsContainer sc=rnaContainers[i];
|
jpayne@68
|
49 sc.setInner(kInnerRNA, 1);
|
jpayne@68
|
50 }
|
jpayne@68
|
51
|
jpayne@68
|
52 statstRNA.setStart(kStartRNA, 14, 4);
|
jpayne@68
|
53 statstRNA.setStop(kStopRNA, 14, 6);
|
jpayne@68
|
54
|
jpayne@68
|
55 stats16S.setStart(kStartRNA, 20, 7);
|
jpayne@68
|
56 stats16S.setStop(kStopRNA, 12, 16);
|
jpayne@68
|
57
|
jpayne@68
|
58 stats23S.setStart(kStartRNA, 17, 3);
|
jpayne@68
|
59 stats23S.setStop(kStopRNA, 15, 12);
|
jpayne@68
|
60
|
jpayne@68
|
61 stats5S.setStart(kStartRNA, 20, 5);
|
jpayne@68
|
62 stats5S.setStop(kStopRNA, 15, 5);
|
jpayne@68
|
63
|
jpayne@68
|
64 stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined
|
jpayne@68
|
65 stats18S.setStop(kStopRNA, 12, 16);
|
jpayne@68
|
66 }
|
jpayne@68
|
67
|
jpayne@68
|
68 /*--------------------------------------------------------------*/
|
jpayne@68
|
69 /*---------------- Public Methods ----------------*/
|
jpayne@68
|
70 /*--------------------------------------------------------------*/
|
jpayne@68
|
71
|
jpayne@68
|
72 public boolean process(String genomeFname, String gffFname){
|
jpayne@68
|
73 // fnames.add(ReadWrite.stripPath(genomeFname));
|
jpayne@68
|
74 numFiles++;
|
jpayne@68
|
75 FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true);
|
jpayne@68
|
76 FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true);
|
jpayne@68
|
77
|
jpayne@68
|
78 if(fnaFile==null || gffFile==null){
|
jpayne@68
|
79 errorState=true;
|
jpayne@68
|
80 return true;
|
jpayne@68
|
81 }
|
jpayne@68
|
82 filesProcessed++;
|
jpayne@68
|
83
|
jpayne@68
|
84 ArrayList<ScafData> scafList;
|
jpayne@68
|
85 {//Scoped to save memory
|
jpayne@68
|
86 ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads);
|
jpayne@68
|
87 readsProcessed+=reads.size();
|
jpayne@68
|
88 scafList=new ArrayList<ScafData>(reads.size());
|
jpayne@68
|
89 for(Read r : reads){
|
jpayne@68
|
90 basesProcessed+=r.length();
|
jpayne@68
|
91 scafList.add(new ScafData(r));
|
jpayne@68
|
92 }
|
jpayne@68
|
93 }
|
jpayne@68
|
94 {//Scoped to save memory
|
jpayne@68
|
95 ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true);
|
jpayne@68
|
96 ArrayList<GffLine> cds=allGffLines[0];
|
jpayne@68
|
97 ArrayList<GffLine> rrna=allGffLines[1];
|
jpayne@68
|
98 ArrayList<GffLine> trna=allGffLines[2];
|
jpayne@68
|
99 genesProcessed+=cds.size();
|
jpayne@68
|
100 genesProcessed+=(rrna==null ? 0 : rrna.size());
|
jpayne@68
|
101 genesProcessed+=(trna==null ? 0 : trna.size());
|
jpayne@68
|
102
|
jpayne@68
|
103 HashMap<String, ScafData> scafMap=makeScafMap(scafList);
|
jpayne@68
|
104 fillScafDataCDS(cds, scafMap);
|
jpayne@68
|
105 fillScafDataRNA(rrna, scafMap);
|
jpayne@68
|
106 fillScafDataRNA(trna, scafMap);
|
jpayne@68
|
107 }
|
jpayne@68
|
108
|
jpayne@68
|
109 countBases(scafList);
|
jpayne@68
|
110 if(PROCESS_PLUS_STRAND){
|
jpayne@68
|
111 processStrand(scafList, Shared.PLUS);
|
jpayne@68
|
112 }
|
jpayne@68
|
113 if(PROCESS_MINUS_STRAND){
|
jpayne@68
|
114 for(ScafData sd : scafList){
|
jpayne@68
|
115 sd.clear();
|
jpayne@68
|
116 sd.reverseComplement();
|
jpayne@68
|
117 }
|
jpayne@68
|
118 processStrand(scafList, Shared.MINUS);
|
jpayne@68
|
119 for(ScafData sd : scafList){
|
jpayne@68
|
120 sd.clear();
|
jpayne@68
|
121 sd.reverseComplement();
|
jpayne@68
|
122 }
|
jpayne@68
|
123 }
|
jpayne@68
|
124 return false;
|
jpayne@68
|
125 }
|
jpayne@68
|
126
|
jpayne@68
|
127 public void add(GeneModel pgm){
|
jpayne@68
|
128 for(int i=0; i<allContainers.length; i++){
|
jpayne@68
|
129 // System.err.println("merging "+allContainers[i].name);
|
jpayne@68
|
130 allContainers[i].add(pgm.allContainers[i]);
|
jpayne@68
|
131 }
|
jpayne@68
|
132
|
jpayne@68
|
133 readsProcessed+=pgm.readsProcessed;
|
jpayne@68
|
134 basesProcessed+=pgm.basesProcessed;
|
jpayne@68
|
135 genesProcessed+=pgm.genesProcessed;
|
jpayne@68
|
136 filesProcessed+=pgm.filesProcessed;
|
jpayne@68
|
137
|
jpayne@68
|
138 // geneStartsProcessed+=pgm.geneStartsProcessed;
|
jpayne@68
|
139 // tRNAProcessed+=pgm.tRNAProcessed;
|
jpayne@68
|
140 // r16SProcessed+=pgm.r16SProcessed;
|
jpayne@68
|
141 // r23SProcessed+=pgm.r23SProcessed;
|
jpayne@68
|
142 // r5SProcessed+=pgm.r5SProcessed;
|
jpayne@68
|
143 // r18SProcessed+=pgm.r18SProcessed;
|
jpayne@68
|
144
|
jpayne@68
|
145 // fnames.addAll(pgm.fnames);
|
jpayne@68
|
146 numFiles+=pgm.numFiles;
|
jpayne@68
|
147 taxIds.addAll(pgm.taxIds);
|
jpayne@68
|
148 Tools.add(baseCounts, pgm.baseCounts);
|
jpayne@68
|
149 }
|
jpayne@68
|
150
|
jpayne@68
|
151 public void multiplyBy(double mult) {
|
jpayne@68
|
152 for(int i=0; i<allContainers.length; i++){
|
jpayne@68
|
153 allContainers[i].multiplyBy(mult);
|
jpayne@68
|
154 }
|
jpayne@68
|
155
|
jpayne@68
|
156 readsProcessed=Math.round(readsProcessed*mult);
|
jpayne@68
|
157 basesProcessed=Math.round(basesProcessed*mult);
|
jpayne@68
|
158 genesProcessed=Math.round(genesProcessed*mult);
|
jpayne@68
|
159 filesProcessed=Math.round(filesProcessed*mult);
|
jpayne@68
|
160
|
jpayne@68
|
161 for(int i=0; i<baseCounts.length; i++){
|
jpayne@68
|
162 baseCounts[i]=Math.round(baseCounts[i]*mult);
|
jpayne@68
|
163 }
|
jpayne@68
|
164 }
|
jpayne@68
|
165
|
jpayne@68
|
166 /*--------------------------------------------------------------*/
|
jpayne@68
|
167 /*---------------- Outer Methods ----------------*/
|
jpayne@68
|
168 /*--------------------------------------------------------------*/
|
jpayne@68
|
169
|
jpayne@68
|
170 public float gc(){
|
jpayne@68
|
171 long a=baseCounts[0];
|
jpayne@68
|
172 long c=baseCounts[1];
|
jpayne@68
|
173 long g=baseCounts[2];
|
jpayne@68
|
174 long t=baseCounts[3];
|
jpayne@68
|
175 return (float)((g+c)/Tools.max(1.0, a+t+g+c));
|
jpayne@68
|
176 }
|
jpayne@68
|
177
|
jpayne@68
|
178 HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){
|
jpayne@68
|
179 HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3);
|
jpayne@68
|
180 for(ScafData sd : scafList){scafMap.put(sd.name, sd);}
|
jpayne@68
|
181 for(ScafData sd : scafList){
|
jpayne@68
|
182 String name=sd.name;
|
jpayne@68
|
183 int idx=name.indexOf(' ');
|
jpayne@68
|
184 if(idx>=0){
|
jpayne@68
|
185 String prefix=name.substring(0, idx);
|
jpayne@68
|
186 if(scafMap.containsKey(prefix)){
|
jpayne@68
|
187 assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'";
|
jpayne@68
|
188 }else{
|
jpayne@68
|
189 scafMap.put(prefix, sd);
|
jpayne@68
|
190 }
|
jpayne@68
|
191 }
|
jpayne@68
|
192 }
|
jpayne@68
|
193 return scafMap;
|
jpayne@68
|
194 }
|
jpayne@68
|
195
|
jpayne@68
|
196 public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){
|
jpayne@68
|
197 if(!callCDS){return;}
|
jpayne@68
|
198 for(GffLine gline : cdsLines){
|
jpayne@68
|
199 ScafData sd=scafMap.get(gline.seqid);
|
jpayne@68
|
200 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
|
jpayne@68
|
201 sd.addCDS(gline);
|
jpayne@68
|
202 }
|
jpayne@68
|
203 }
|
jpayne@68
|
204
|
jpayne@68
|
205 public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){
|
jpayne@68
|
206 for(GffLine gline : rnaLines){
|
jpayne@68
|
207 ScafData sd=scafMap.get(gline.seqid);
|
jpayne@68
|
208 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
|
jpayne@68
|
209 if(processType(gline.prokType())){
|
jpayne@68
|
210 sd.addRNA(gline);
|
jpayne@68
|
211 }
|
jpayne@68
|
212 }
|
jpayne@68
|
213 }
|
jpayne@68
|
214
|
jpayne@68
|
215 public void processStrand(ArrayList<ScafData> scafList, int strand){
|
jpayne@68
|
216 for(ScafData sd : scafList){
|
jpayne@68
|
217 processCDS(sd, strand);
|
jpayne@68
|
218 processRNA(sd, strand);
|
jpayne@68
|
219 }
|
jpayne@68
|
220 }
|
jpayne@68
|
221
|
jpayne@68
|
222 /*--------------------------------------------------------------*/
|
jpayne@68
|
223 /*---------------- Inner Methods ----------------*/
|
jpayne@68
|
224 /*--------------------------------------------------------------*/
|
jpayne@68
|
225
|
jpayne@68
|
226 private void countBases(ArrayList<ScafData> scafList){
|
jpayne@68
|
227 for(ScafData sd : scafList){
|
jpayne@68
|
228 countBases(sd.bases);
|
jpayne@68
|
229 }
|
jpayne@68
|
230 }
|
jpayne@68
|
231
|
jpayne@68
|
232 private void countBases(byte[] bases){
|
jpayne@68
|
233 for(byte b : bases){
|
jpayne@68
|
234 int x=AminoAcid.baseToNumberACGTother[b];
|
jpayne@68
|
235 baseCounts[x]++;
|
jpayne@68
|
236 }
|
jpayne@68
|
237 }
|
jpayne@68
|
238
|
jpayne@68
|
239 /*--------------------------------------------------------------*/
|
jpayne@68
|
240 /*---------------- Finding Codons ----------------*/
|
jpayne@68
|
241 /*--------------------------------------------------------------*/
|
jpayne@68
|
242
|
jpayne@68
|
243 private static void findStopCodons(byte[] bases, IntList list, BitSet valid){
|
jpayne@68
|
244 final int k=3;
|
jpayne@68
|
245 final int mask=~((-1)<<(2*k));
|
jpayne@68
|
246 int kmer=0;
|
jpayne@68
|
247 int len=0;
|
jpayne@68
|
248
|
jpayne@68
|
249 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
250 byte b=bases[i];
|
jpayne@68
|
251 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
252 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
253 if(x>=0){
|
jpayne@68
|
254 len++;
|
jpayne@68
|
255 if(len>=k){
|
jpayne@68
|
256 int point=i;//End of the stop codon
|
jpayne@68
|
257 if(isStopCodon(kmer) && !valid.get(point)){
|
jpayne@68
|
258 list.add(point);
|
jpayne@68
|
259 valid.set(point);
|
jpayne@68
|
260 }
|
jpayne@68
|
261 }
|
jpayne@68
|
262 }else{len=0;}
|
jpayne@68
|
263 }
|
jpayne@68
|
264
|
jpayne@68
|
265 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
|
jpayne@68
|
266 if(!valid.get(i)){
|
jpayne@68
|
267 list.add(i);
|
jpayne@68
|
268 }
|
jpayne@68
|
269 }
|
jpayne@68
|
270 }
|
jpayne@68
|
271
|
jpayne@68
|
272 private static void findStartCodons(byte[] bases, IntList list, BitSet valid){
|
jpayne@68
|
273 final int k=3;
|
jpayne@68
|
274 final int mask=~((-1)<<(2*k));
|
jpayne@68
|
275 int kmer=0;
|
jpayne@68
|
276 int len=0;
|
jpayne@68
|
277
|
jpayne@68
|
278 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
279 byte b=bases[i];
|
jpayne@68
|
280 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
281 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
282 if(x>=0){
|
jpayne@68
|
283 len++;
|
jpayne@68
|
284 if(len>=k){
|
jpayne@68
|
285 int point=i-k+1;//Start of the start codon
|
jpayne@68
|
286 if(isStartCodon(kmer) && !valid.get(point)){
|
jpayne@68
|
287 list.add(point);
|
jpayne@68
|
288 valid.set(point);
|
jpayne@68
|
289 }
|
jpayne@68
|
290 }
|
jpayne@68
|
291 }else{len=0;}
|
jpayne@68
|
292 }
|
jpayne@68
|
293
|
jpayne@68
|
294 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
|
jpayne@68
|
295 if(!valid.get(i)){
|
jpayne@68
|
296 list.add(i);
|
jpayne@68
|
297 }
|
jpayne@68
|
298 }
|
jpayne@68
|
299 }
|
jpayne@68
|
300
|
jpayne@68
|
301 /*--------------------------------------------------------------*/
|
jpayne@68
|
302 /*---------------- Processing GffLines ----------------*/
|
jpayne@68
|
303 /*--------------------------------------------------------------*/
|
jpayne@68
|
304
|
jpayne@68
|
305 private static void processGene(GffLine gline, ScafData sd){
|
jpayne@68
|
306 if(gline.length()<2){return;}
|
jpayne@68
|
307 final int strand=gline.strand;
|
jpayne@68
|
308 assert(strand==sd.strand());
|
jpayne@68
|
309 final byte[] frames=sd.frames;
|
jpayne@68
|
310 int start=gline.start-1, stop=gline.stop-1;
|
jpayne@68
|
311 if(start<0 || stop>=sd.length()){return;}
|
jpayne@68
|
312 // assert(start<stop) : gline; //Not always true for euks...
|
jpayne@68
|
313 if(strand==Shared.MINUS){
|
jpayne@68
|
314 int x=sd.length()-start-1;
|
jpayne@68
|
315 int y=sd.length()-stop-1;
|
jpayne@68
|
316 start=y;
|
jpayne@68
|
317 stop=x;
|
jpayne@68
|
318
|
jpayne@68
|
319 // String a=new String(sd.bases, start, 3);
|
jpayne@68
|
320 // String b=new String(sd.bases, stop-2, 3);
|
jpayne@68
|
321 //// assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3);
|
jpayne@68
|
322 // outstream.println(a+", "+b+", "+start+", "+stop);
|
jpayne@68
|
323 }
|
jpayne@68
|
324 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
|
jpayne@68
|
325 markFrames(start, stop, frames, kInnerCDS);
|
jpayne@68
|
326 sd.starts.add(start);
|
jpayne@68
|
327 sd.stops.add(stop);
|
jpayne@68
|
328 // assert(gline.start!=337) : gline+"\n"+start+", "+stop;
|
jpayne@68
|
329 }
|
jpayne@68
|
330
|
jpayne@68
|
331 private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){
|
jpayne@68
|
332 final int strand=gline.strand;
|
jpayne@68
|
333 assert(strand==sd.strand());
|
jpayne@68
|
334 final byte[] frames=sd.frames;
|
jpayne@68
|
335 int start=gline.start-1, stop=gline.stop-1;
|
jpayne@68
|
336 if(start<0 || stop>=sd.length()){return false;}
|
jpayne@68
|
337 assert(start<=stop);
|
jpayne@68
|
338 if(strand==Shared.MINUS){
|
jpayne@68
|
339 int x=sd.length()-start-1;
|
jpayne@68
|
340 int y=sd.length()-stop-1;
|
jpayne@68
|
341 start=y;
|
jpayne@68
|
342 stop=x;
|
jpayne@68
|
343 }
|
jpayne@68
|
344
|
jpayne@68
|
345 if(AnalyzeGenes.alignRibo){
|
jpayne@68
|
346 // byte[] seq=sd.fetch(start, stop);
|
jpayne@68
|
347 Read[] consensusReads=ProkObject.consensusReads(type);
|
jpayne@68
|
348 byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null);
|
jpayne@68
|
349 float minIdentity=ProkObject.minID(type);
|
jpayne@68
|
350 if(universal!=null){
|
jpayne@68
|
351 int[] coords=KillSwitch.allocInt1D(2);
|
jpayne@68
|
352 final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50));
|
jpayne@68
|
353 final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50));
|
jpayne@68
|
354 float id1=align(universal, sd.bases, a, b, minIdentity, coords);
|
jpayne@68
|
355 final int rstart=coords[0], rstop=coords[1];
|
jpayne@68
|
356 // assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
|
jpayne@68
|
357 if(id1<minIdentity){
|
jpayne@68
|
358 // System.err.println("Low identity: "+String.format("%.2s", 100*id1));
|
jpayne@68
|
359 return false;
|
jpayne@68
|
360 }else{
|
jpayne@68
|
361 // System.err.println("Good identity: "+String.format("%.2s", 100*id1));
|
jpayne@68
|
362 }
|
jpayne@68
|
363 if(AnalyzeGenes.adjustEndpoints){
|
jpayne@68
|
364 int startSlop=startSlop(type);
|
jpayne@68
|
365 int stopSlop=stopSlop(type);
|
jpayne@68
|
366 if(Tools.absdif(start, rstart)>startSlop){
|
jpayne@68
|
367 // System.err.println("rstart:\t"+start+" -> "+rstart);
|
jpayne@68
|
368 start=rstart;
|
jpayne@68
|
369 }
|
jpayne@68
|
370 if(Tools.absdif(stop, rstop)>stopSlop){
|
jpayne@68
|
371 // System.err.println("rstop: \t"+stop+" -> "+rstop);
|
jpayne@68
|
372 stop=rstop;
|
jpayne@68
|
373 }
|
jpayne@68
|
374 }
|
jpayne@68
|
375 }
|
jpayne@68
|
376 }
|
jpayne@68
|
377
|
jpayne@68
|
378 StatsContainer sc=allContainers[type];
|
jpayne@68
|
379 sc.start.processPoint(sd.bases, start, 1);
|
jpayne@68
|
380 sc.stop.processPoint(sd.bases, stop, 1);
|
jpayne@68
|
381 assert(sc!=statsCDS);
|
jpayne@68
|
382
|
jpayne@68
|
383 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
|
jpayne@68
|
384 final byte flag=typeToFlag(type);
|
jpayne@68
|
385 for(int i=start+kInnerRNA-1; i<=stop; i++){
|
jpayne@68
|
386 frames[i]|=flag;
|
jpayne@68
|
387 }
|
jpayne@68
|
388 return true;
|
jpayne@68
|
389 }
|
jpayne@68
|
390
|
jpayne@68
|
391 private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){
|
jpayne@68
|
392 // final int a=0, b=ref.length-1;
|
jpayne@68
|
393 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA();
|
jpayne@68
|
394 final int minScore=ssa.minScoreByIdentity(query.length, minIdentity);
|
jpayne@68
|
395 int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore);
|
jpayne@68
|
396 if(max==null){return 0;}
|
jpayne@68
|
397
|
jpayne@68
|
398 final int rows=max[0];
|
jpayne@68
|
399 final int maxCol=max[1];
|
jpayne@68
|
400 final int maxState=max[2];
|
jpayne@68
|
401 // final int maxScore=max[3];
|
jpayne@68
|
402 // final int maxStart=max[4];
|
jpayne@68
|
403
|
jpayne@68
|
404 //returns {score, bestRefStart, bestRefStop}
|
jpayne@68
|
405 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
|
jpayne@68
|
406 final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState);
|
jpayne@68
|
407 final int rstart=Tools.max(score[1], 0);
|
jpayne@68
|
408 final int rstop=Tools.min(score[2], ref.length-1);
|
jpayne@68
|
409 if(coords!=null){
|
jpayne@68
|
410 coords[0]=rstart;
|
jpayne@68
|
411 coords[1]=rstop;
|
jpayne@68
|
412 }
|
jpayne@68
|
413 final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null);
|
jpayne@68
|
414 return id;
|
jpayne@68
|
415 }
|
jpayne@68
|
416
|
jpayne@68
|
417 /**
|
jpayne@68
|
418 * Each frame byte has a bit marked for valid coding frames.
|
jpayne@68
|
419 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon.
|
jpayne@68
|
420 * If frames[23]=0, then no coding kmer end at that location on this strand.
|
jpayne@68
|
421 * @param start
|
jpayne@68
|
422 * @param stop
|
jpayne@68
|
423 * @param frames
|
jpayne@68
|
424 * @param k
|
jpayne@68
|
425 */
|
jpayne@68
|
426 private static void markFrames(int start, int stop, byte[] frames, int k){
|
jpayne@68
|
427 assert(start<=stop) : start+", "+stop;
|
jpayne@68
|
428 for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){
|
jpayne@68
|
429 frames[i]=(byte)(frames[i]|frameBit);
|
jpayne@68
|
430 frameBit<<=1;
|
jpayne@68
|
431 if(frameBit>4){frameBit=1;}
|
jpayne@68
|
432 }
|
jpayne@68
|
433 // assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct
|
jpayne@68
|
434 }
|
jpayne@68
|
435
|
jpayne@68
|
436 /*--------------------------------------------------------------*/
|
jpayne@68
|
437 /*---------------- Counting Kmers ----------------*/
|
jpayne@68
|
438 /*--------------------------------------------------------------*/
|
jpayne@68
|
439
|
jpayne@68
|
440 private void processCDS(ScafData sd, int strand){
|
jpayne@68
|
441 if(!callCDS){return;}
|
jpayne@68
|
442 ArrayList<GffLine> glines=sd.cdsLines[strand];
|
jpayne@68
|
443 for(GffLine gline : glines){
|
jpayne@68
|
444 assert(gline.strand==strand);
|
jpayne@68
|
445 processGene(gline, sd);
|
jpayne@68
|
446 statsCDS.addLength(gline.length());
|
jpayne@68
|
447 }
|
jpayne@68
|
448
|
jpayne@68
|
449 statsCDS.inner.processCDSFrames(sd.bases, sd.frames);
|
jpayne@68
|
450 BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1);
|
jpayne@68
|
451 BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1);
|
jpayne@68
|
452 // outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops.");
|
jpayne@68
|
453 sd.clear();
|
jpayne@68
|
454 findStartCodons(sd.bases, sd.starts, startSet);
|
jpayne@68
|
455 findStopCodons(sd.bases, sd.stops, stopSet);
|
jpayne@68
|
456 // outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops.");
|
jpayne@68
|
457 processEnds(sd.bases, statsCDS.start, sd.starts, 0);
|
jpayne@68
|
458 processEnds(sd.bases, statsCDS.stop, sd.stops, 0);
|
jpayne@68
|
459 }
|
jpayne@68
|
460
|
jpayne@68
|
461 private static int getGlineType(GffLine gline, ScafData sd){
|
jpayne@68
|
462 if(!gline.inbounds(sd.length()) || gline.partial()){return -1;}
|
jpayne@68
|
463
|
jpayne@68
|
464 final int length=gline.length();
|
jpayne@68
|
465 final int type=gline.prokType();
|
jpayne@68
|
466 if(type<0){
|
jpayne@68
|
467 return type;
|
jpayne@68
|
468 }else if(type==CDS){
|
jpayne@68
|
469 return type;
|
jpayne@68
|
470 }else if(type==tRNA && length>=40 && length<=120){
|
jpayne@68
|
471 return type;
|
jpayne@68
|
472 }else if(type==r16S && length>=1440 && length<=1620){
|
jpayne@68
|
473 return type;
|
jpayne@68
|
474 }else if(type==r23S && length>=2720 && length<=3170){
|
jpayne@68
|
475 return type;
|
jpayne@68
|
476 }else if(type==r5S && length>=90 && length<=150){
|
jpayne@68
|
477 return type;
|
jpayne@68
|
478 }else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range
|
jpayne@68
|
479 return type;
|
jpayne@68
|
480 }
|
jpayne@68
|
481 return -1;
|
jpayne@68
|
482 }
|
jpayne@68
|
483
|
jpayne@68
|
484 private void processRNA(ScafData sd, int strand){
|
jpayne@68
|
485 sd.clear();
|
jpayne@68
|
486 ArrayList<GffLine> lines=sd.rnaLines[strand];
|
jpayne@68
|
487 for(GffLine gline : lines){
|
jpayne@68
|
488 assert(gline.strand==strand);
|
jpayne@68
|
489 final int type=getGlineType(gline, sd);
|
jpayne@68
|
490 if(type>0){
|
jpayne@68
|
491 StatsContainer sc=allContainers[type];
|
jpayne@68
|
492 sc.addLength(gline.length());
|
jpayne@68
|
493 processRnaLine(gline, sd, type);
|
jpayne@68
|
494 }
|
jpayne@68
|
495 }
|
jpayne@68
|
496 processRnaInner(sd);
|
jpayne@68
|
497 processRnaEnds(sd);
|
jpayne@68
|
498 }
|
jpayne@68
|
499
|
jpayne@68
|
500 void processRnaInner(ScafData sd){
|
jpayne@68
|
501 byte[] bases=sd.bases;
|
jpayne@68
|
502 byte[] frames=sd.frames;
|
jpayne@68
|
503 final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs.
|
jpayne@68
|
504 final int mask=~((-1)<<(2*k));
|
jpayne@68
|
505 int kmer=0;
|
jpayne@68
|
506 int len=0;
|
jpayne@68
|
507
|
jpayne@68
|
508 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
509 byte b=bases[i];
|
jpayne@68
|
510 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
511 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
512 if(x>=0){
|
jpayne@68
|
513 len++;
|
jpayne@68
|
514 if(len>=k){
|
jpayne@68
|
515 int vf=frames[i];
|
jpayne@68
|
516 for(int type=0; type<5; type++){
|
jpayne@68
|
517 int valid=vf&1;
|
jpayne@68
|
518 rnaContainers[type].inner.add(kmer, 0, valid);
|
jpayne@68
|
519 vf=(vf>>1);
|
jpayne@68
|
520 }
|
jpayne@68
|
521 }
|
jpayne@68
|
522 }else{len=0;}
|
jpayne@68
|
523 }
|
jpayne@68
|
524 }
|
jpayne@68
|
525
|
jpayne@68
|
526 void processRnaEnds(ScafData sd){
|
jpayne@68
|
527 byte[] bases=sd.bases;
|
jpayne@68
|
528
|
jpayne@68
|
529 final int k=stats16S.start.k;
|
jpayne@68
|
530 final int kMax=stats16S.start.kMax;
|
jpayne@68
|
531 final int mask=stats16S.start.mask;
|
jpayne@68
|
532 final long[] counts=new long[kMax];//TODO: Slow
|
jpayne@68
|
533
|
jpayne@68
|
534 int kmer=0;
|
jpayne@68
|
535 int len=0;
|
jpayne@68
|
536
|
jpayne@68
|
537 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
538 byte b=bases[i];
|
jpayne@68
|
539 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
540 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
541
|
jpayne@68
|
542 if(x>=0){
|
jpayne@68
|
543 len++;
|
jpayne@68
|
544 if(len>=k){
|
jpayne@68
|
545 counts[kmer]++;
|
jpayne@68
|
546 }
|
jpayne@68
|
547 }else{len=0;}
|
jpayne@68
|
548 }
|
jpayne@68
|
549 for(StatsContainer sc : rnaContainers){
|
jpayne@68
|
550 FrameStats fs=sc.start;
|
jpayne@68
|
551 for(long[] array : fs.countsFalse){
|
jpayne@68
|
552 Tools.add(array, counts);
|
jpayne@68
|
553 }
|
jpayne@68
|
554 fs=sc.stop;
|
jpayne@68
|
555 for(long[] array : fs.countsFalse){
|
jpayne@68
|
556 Tools.add(array, counts);
|
jpayne@68
|
557 }
|
jpayne@68
|
558 }
|
jpayne@68
|
559 }
|
jpayne@68
|
560
|
jpayne@68
|
561 private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){
|
jpayne@68
|
562 BitSet points=new BitSet(bases.length);
|
jpayne@68
|
563 for(int i=0; i<list.size; i++){
|
jpayne@68
|
564 int point=list.get(i);
|
jpayne@68
|
565 stats.processPoint(bases, list.get(i), valid);
|
jpayne@68
|
566 points.set(point);
|
jpayne@68
|
567 }
|
jpayne@68
|
568 return points;
|
jpayne@68
|
569 }
|
jpayne@68
|
570
|
jpayne@68
|
571 /*--------------------------------------------------------------*/
|
jpayne@68
|
572 /*---------------- Scoring ----------------*/
|
jpayne@68
|
573 /*--------------------------------------------------------------*/
|
jpayne@68
|
574
|
jpayne@68
|
575 // //Assumes bases are in the correct strand
|
jpayne@68
|
576 // public float calcStartScore(int start, int stop, byte[] bases){
|
jpayne@68
|
577 // float f=scorePoint(start, bases, startStats);
|
jpayne@68
|
578 //// float ss=scoreStart2(start, bases, stop, innerKmerStats);
|
jpayne@68
|
579 //// if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study.
|
jpayne@68
|
580 // return f;
|
jpayne@68
|
581 // }
|
jpayne@68
|
582 //
|
jpayne@68
|
583 // //Assumes bases are in the correct strand
|
jpayne@68
|
584 // public float calcStopScore(int stop, byte[] bases){
|
jpayne@68
|
585 // float f=scorePoint(stop, bases, stopStats);
|
jpayne@68
|
586 // return f;
|
jpayne@68
|
587 // }
|
jpayne@68
|
588 //
|
jpayne@68
|
589 // //Assumes bases are in the correct strand
|
jpayne@68
|
590 // public float calcRnaStartScore(int start, int stop, byte[] bases){
|
jpayne@68
|
591 // float f=scorePoint(start, bases, rrnaStartStats);
|
jpayne@68
|
592 // return f;
|
jpayne@68
|
593 // }
|
jpayne@68
|
594 //
|
jpayne@68
|
595 // //Assumes bases are in the correct strand
|
jpayne@68
|
596 // public float calcRnaStopScore(int stop, byte[] bases){
|
jpayne@68
|
597 // float f=scorePoint(stop, bases, rrnaStopStats);
|
jpayne@68
|
598 // return f;
|
jpayne@68
|
599 // }
|
jpayne@68
|
600
|
jpayne@68
|
601 // public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){
|
jpayne@68
|
602 //
|
jpayne@68
|
603 // assert(stats.frames==3);
|
jpayne@68
|
604 // final int k=stats.k;
|
jpayne@68
|
605 // final int mask=~((-1)<<(2*k));
|
jpayne@68
|
606 //
|
jpayne@68
|
607 // int kmer=0;
|
jpayne@68
|
608 // int len=0;
|
jpayne@68
|
609 // float score=0;
|
jpayne@68
|
610 // int numKmers=0;
|
jpayne@68
|
611 //
|
jpayne@68
|
612 // for(int pos=start, currentFrame=startFrame; pos<stop; pos++){
|
jpayne@68
|
613 // final byte b=bases[pos];
|
jpayne@68
|
614 // final int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
615 //
|
jpayne@68
|
616 // if(x>=0){
|
jpayne@68
|
617 // kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
618 // len++;
|
jpayne@68
|
619 // if(len>=k){
|
jpayne@68
|
620 // float prob=stats.probs[currentFrame][kmer];
|
jpayne@68
|
621 // float dif=prob-0.99f;//Prob above 1 is more likely than average
|
jpayne@68
|
622 // score+=dif;
|
jpayne@68
|
623 // numKmers++;
|
jpayne@68
|
624 // }
|
jpayne@68
|
625 // }else{
|
jpayne@68
|
626 // len=0;
|
jpayne@68
|
627 // kmer=0;
|
jpayne@68
|
628 // }
|
jpayne@68
|
629 //
|
jpayne@68
|
630 // currentFrame++;
|
jpayne@68
|
631 // if(currentFrame>2){currentFrame=0;}
|
jpayne@68
|
632 // }
|
jpayne@68
|
633 // return score/Tools.max(1f, numKmers);
|
jpayne@68
|
634 // }
|
jpayne@68
|
635 //
|
jpayne@68
|
636 // /**
|
jpayne@68
|
637 // * TODO
|
jpayne@68
|
638 // * Evaluate the relative difference between left and right frequencies.
|
jpayne@68
|
639 // * The purpose is to find locations where the left side looks noncoding and the right side looks coding.
|
jpayne@68
|
640 // * Does not currently yield useful results.
|
jpayne@68
|
641 // */
|
jpayne@68
|
642 // public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){
|
jpayne@68
|
643 // final int k=stats.k;
|
jpayne@68
|
644 //
|
jpayne@68
|
645 // int start=point-45;
|
jpayne@68
|
646 // if(start<0 || stop>bases.length){return 0.5f;}
|
jpayne@68
|
647 //
|
jpayne@68
|
648 // float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats);
|
jpayne@68
|
649 // float right=calcKmerScore(point, stop-3, 0, bases, stats);
|
jpayne@68
|
650 // return right-left; //High numbers are likely to be starts; non-starts should be near 0.
|
jpayne@68
|
651 // }
|
jpayne@68
|
652
|
jpayne@68
|
653 /*--------------------------------------------------------------*/
|
jpayne@68
|
654 /*---------------- toString ----------------*/
|
jpayne@68
|
655 /*--------------------------------------------------------------*/
|
jpayne@68
|
656
|
jpayne@68
|
657 @Override
|
jpayne@68
|
658 public String toString(){
|
jpayne@68
|
659 return appendTo(new ByteBuilder()).toString();
|
jpayne@68
|
660 }
|
jpayne@68
|
661
|
jpayne@68
|
662 public ByteBuilder appendTo(ByteBuilder bb){
|
jpayne@68
|
663
|
jpayne@68
|
664 // Collections.sort(fnames);
|
jpayne@68
|
665 taxIds.sort();
|
jpayne@68
|
666
|
jpayne@68
|
667 bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n");
|
jpayne@68
|
668 bb.append("#files");
|
jpayne@68
|
669 bb.tab().append(numFiles);
|
jpayne@68
|
670 // if(fnames.size()>5){
|
jpayne@68
|
671 // bb.tab().append(fnames.size());
|
jpayne@68
|
672 // }else{
|
jpayne@68
|
673 // for(String fname : fnames){
|
jpayne@68
|
674 // bb.tab().append(fname);
|
jpayne@68
|
675 // }
|
jpayne@68
|
676 // }
|
jpayne@68
|
677 bb.nl();
|
jpayne@68
|
678 bb.append("#taxIDs");
|
jpayne@68
|
679 for(int i=0; i<taxIds.size; i++){
|
jpayne@68
|
680 bb.tab().append(taxIds.get(i));
|
jpayne@68
|
681 }
|
jpayne@68
|
682 bb.nl();
|
jpayne@68
|
683 // bb.append("#k_inner\t").append(innerKmerLength).nl();
|
jpayne@68
|
684 // bb.append("#k_end\t").append(endKmerLength).nl();
|
jpayne@68
|
685 // bb.append("#start_left_offset\t").append(startLeftOffset).nl();
|
jpayne@68
|
686 // bb.append("#start_right_offset\t").append(startRightOffset).nl();
|
jpayne@68
|
687 // bb.append("#stop_left_offset\t").append(stopLeftOffset).nl();
|
jpayne@68
|
688 // bb.append("#stop_right_offset\t").append(stopRightOffset).nl();
|
jpayne@68
|
689 bb.append("#scaffolds\t").append(readsProcessed).nl();
|
jpayne@68
|
690 bb.append("#bases\t").append(basesProcessed).nl();
|
jpayne@68
|
691 bb.append("#genes\t").append(genesProcessed).nl();
|
jpayne@68
|
692 bb.append("#GC\t").append(gc(),2).nl();
|
jpayne@68
|
693 bb.append("#ACGTN");
|
jpayne@68
|
694 for(long x : baseCounts){
|
jpayne@68
|
695 bb.tab().append(x);
|
jpayne@68
|
696 }
|
jpayne@68
|
697 bb.nl();
|
jpayne@68
|
698
|
jpayne@68
|
699 for(StatsContainer sc : allContainers){
|
jpayne@68
|
700 sc.appendTo(bb);
|
jpayne@68
|
701 }
|
jpayne@68
|
702 assert(allContainers.length>5) : allContainers.length;
|
jpayne@68
|
703
|
jpayne@68
|
704 return bb;
|
jpayne@68
|
705 }
|
jpayne@68
|
706
|
jpayne@68
|
707 /*--------------------------------------------------------------*/
|
jpayne@68
|
708 /*---------------- Stats ----------------*/
|
jpayne@68
|
709 /*--------------------------------------------------------------*/
|
jpayne@68
|
710
|
jpayne@68
|
711 public final StatsContainer statsCDS=new StatsContainer(CDS);
|
jpayne@68
|
712 public final StatsContainer statstRNA=new StatsContainer(tRNA);
|
jpayne@68
|
713 public final StatsContainer stats16S=new StatsContainer(r16S);
|
jpayne@68
|
714 public final StatsContainer stats23S=new StatsContainer(r23S);
|
jpayne@68
|
715 public final StatsContainer stats5S=new StatsContainer(r5S);
|
jpayne@68
|
716 public final StatsContainer stats18S=new StatsContainer(r18S);
|
jpayne@68
|
717
|
jpayne@68
|
718 final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S};
|
jpayne@68
|
719 final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S};
|
jpayne@68
|
720 //public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
|
jpayne@68
|
721
|
jpayne@68
|
722 // public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0);
|
jpayne@68
|
723 // public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset);
|
jpayne@68
|
724 // public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset);
|
jpayne@68
|
725 //
|
jpayne@68
|
726 // public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8);
|
jpayne@68
|
727 // public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8);
|
jpayne@68
|
728 //
|
jpayne@68
|
729 // public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0);
|
jpayne@68
|
730 // public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0);
|
jpayne@68
|
731 // public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0);
|
jpayne@68
|
732 // public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0);
|
jpayne@68
|
733 // public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats};
|
jpayne@68
|
734
|
jpayne@68
|
735 /*--------------------------------------------------------------*/
|
jpayne@68
|
736
|
jpayne@68
|
737 // public ArrayList<String> fnames=new ArrayList<String>();
|
jpayne@68
|
738 public int numFiles=0;
|
jpayne@68
|
739 public IntList taxIds=new IntList();
|
jpayne@68
|
740
|
jpayne@68
|
741
|
jpayne@68
|
742 /*--------------------------------------------------------------*/
|
jpayne@68
|
743 /*---------------- Fields ----------------*/
|
jpayne@68
|
744 /*--------------------------------------------------------------*/
|
jpayne@68
|
745
|
jpayne@68
|
746 private long maxReads=-1;
|
jpayne@68
|
747
|
jpayne@68
|
748 long readsProcessed=0;
|
jpayne@68
|
749 long basesProcessed=0;
|
jpayne@68
|
750 long genesProcessed=0;
|
jpayne@68
|
751 long filesProcessed=0;
|
jpayne@68
|
752 long[] baseCounts=new long[5];
|
jpayne@68
|
753
|
jpayne@68
|
754 /*--------------------------------------------------------------*/
|
jpayne@68
|
755 /*---------------- Static Setters ----------------*/
|
jpayne@68
|
756 /*--------------------------------------------------------------*/
|
jpayne@68
|
757
|
jpayne@68
|
758 public synchronized void setStatics(){
|
jpayne@68
|
759 // assert(!setStatics);
|
jpayne@68
|
760 kInnerCDS=statsCDS.inner.k;
|
jpayne@68
|
761 kStartCDS=statsCDS.start.k;
|
jpayne@68
|
762 kStopCDS=statsCDS.stop.k;
|
jpayne@68
|
763
|
jpayne@68
|
764 setStartLeftOffset(statsCDS.start.leftOffset);
|
jpayne@68
|
765 setStartRightOffset(statsCDS.start.rightOffset());
|
jpayne@68
|
766
|
jpayne@68
|
767 setStopLeftOffset(statsCDS.stop.leftOffset);
|
jpayne@68
|
768 setStopRightOffset(statsCDS.stop.rightOffset());
|
jpayne@68
|
769
|
jpayne@68
|
770 kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here?
|
jpayne@68
|
771 kStartRNA=stats16S.start.k;
|
jpayne@68
|
772 kStopRNA=stats16S.stop.k;
|
jpayne@68
|
773 setStatics=true;
|
jpayne@68
|
774 }
|
jpayne@68
|
775
|
jpayne@68
|
776 public static void setInnerK(int k){
|
jpayne@68
|
777 kInnerCDS=k;
|
jpayne@68
|
778 }
|
jpayne@68
|
779
|
jpayne@68
|
780 public static void setStartK(int k){
|
jpayne@68
|
781 kStartCDS=k;
|
jpayne@68
|
782 }
|
jpayne@68
|
783
|
jpayne@68
|
784 public static void setStopK(int k){
|
jpayne@68
|
785 kStopCDS=k;
|
jpayne@68
|
786 }
|
jpayne@68
|
787
|
jpayne@68
|
788 public static void setStartLeftOffset(int x){
|
jpayne@68
|
789 startLeftOffset=x;
|
jpayne@68
|
790 startFrames=startLeftOffset+startRightOffset+1;
|
jpayne@68
|
791 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
|
jpayne@68
|
792 }
|
jpayne@68
|
793
|
jpayne@68
|
794 public static void setStartRightOffset(int x){
|
jpayne@68
|
795 startRightOffset=x;
|
jpayne@68
|
796 startFrames=startLeftOffset+startRightOffset+1;
|
jpayne@68
|
797 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
|
jpayne@68
|
798 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
|
jpayne@68
|
799 }
|
jpayne@68
|
800
|
jpayne@68
|
801 public static void setStopLeftOffset(int x){
|
jpayne@68
|
802 stopLeftOffset=x;
|
jpayne@68
|
803 stopFrames=stopLeftOffset+stopRightOffset+1;
|
jpayne@68
|
804 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
|
jpayne@68
|
805 }
|
jpayne@68
|
806
|
jpayne@68
|
807 public static void setStopRightOffset(int x){
|
jpayne@68
|
808 stopRightOffset=x;
|
jpayne@68
|
809 stopFrames=stopLeftOffset+stopRightOffset+1;
|
jpayne@68
|
810 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
|
jpayne@68
|
811 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
|
jpayne@68
|
812 }
|
jpayne@68
|
813
|
jpayne@68
|
814 public static final boolean isStartCodon(int code){
|
jpayne@68
|
815 return code>=0 && code<=63 && isStartCodon[code];
|
jpayne@68
|
816 }
|
jpayne@68
|
817 public static final boolean isStopCodon(int code){
|
jpayne@68
|
818 return code>=0 && code<=63 && isStopCodon[code];
|
jpayne@68
|
819 }
|
jpayne@68
|
820
|
jpayne@68
|
821 /*--------------------------------------------------------------*/
|
jpayne@68
|
822 /*---------------- Class Init ----------------*/
|
jpayne@68
|
823 /*--------------------------------------------------------------*/
|
jpayne@68
|
824
|
jpayne@68
|
825 private static boolean[] makeIsCodon(String[] codons){
|
jpayne@68
|
826 boolean[] array=new boolean[64];
|
jpayne@68
|
827 for(String s : codons){
|
jpayne@68
|
828 int x=AminoAcid.toNumber(s);
|
jpayne@68
|
829 array[x]=true;
|
jpayne@68
|
830 }
|
jpayne@68
|
831 return array;
|
jpayne@68
|
832 }
|
jpayne@68
|
833
|
jpayne@68
|
834 public static int kInnerCDS=6;
|
jpayne@68
|
835 public static int kStartCDS=3;
|
jpayne@68
|
836 public static int kStopCDS=3;
|
jpayne@68
|
837
|
jpayne@68
|
838 static int startLeftOffset(){return startLeftOffset;}
|
jpayne@68
|
839 static int startRightOffset(){return startRightOffset;}
|
jpayne@68
|
840 static int startFrames(){return startFrames;}
|
jpayne@68
|
841
|
jpayne@68
|
842 private static int startLeftOffset=21; //21 works well for k=4
|
jpayne@68
|
843 private static int startRightOffset=8; //10 works well for k=4
|
jpayne@68
|
844 private static int startFrames=startLeftOffset+startRightOffset+1;
|
jpayne@68
|
845
|
jpayne@68
|
846 private static int stopLeftOffset=9;
|
jpayne@68
|
847 private static int stopRightOffset=12;
|
jpayne@68
|
848 private static int stopFrames=stopLeftOffset+stopRightOffset+1;
|
jpayne@68
|
849
|
jpayne@68
|
850 private static boolean setStatics=false;
|
jpayne@68
|
851
|
jpayne@68
|
852 /*--------------------------------------------------------------*/
|
jpayne@68
|
853 /*---------------- More Statics ----------------*/
|
jpayne@68
|
854 /*--------------------------------------------------------------*/
|
jpayne@68
|
855
|
jpayne@68
|
856 //E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9]
|
jpayne@68
|
857 public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"};
|
jpayne@68
|
858 public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"};
|
jpayne@68
|
859 public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"};
|
jpayne@68
|
860 public static boolean[] isStartCodon=makeIsCodon(startCodons);
|
jpayne@68
|
861 public static boolean[] isStopCodon=makeIsCodon(stopCodons);
|
jpayne@68
|
862
|
jpayne@68
|
863 /*--------------------------------------------------------------*/
|
jpayne@68
|
864
|
jpayne@68
|
865 private static PrintStream outstream=System.err;
|
jpayne@68
|
866 public static boolean verbose=false;
|
jpayne@68
|
867 public static boolean errorState=false;
|
jpayne@68
|
868
|
jpayne@68
|
869 }
|