jpayne@68
|
1 package prok;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.PrintStream;
|
jpayne@68
|
4 import java.util.ArrayList;
|
jpayne@68
|
5 import java.util.Arrays;
|
jpayne@68
|
6 import java.util.Collections;
|
jpayne@68
|
7
|
jpayne@68
|
8 import aligner.SingleStateAlignerFlat2;
|
jpayne@68
|
9 import aligner.SingleStateAlignerFlat3;
|
jpayne@68
|
10 import aligner.SingleStateAlignerFlatFloat;
|
jpayne@68
|
11 import dna.AminoAcid;
|
jpayne@68
|
12 import shared.KillSwitch;
|
jpayne@68
|
13 import shared.Tools;
|
jpayne@68
|
14 import stream.Read;
|
jpayne@68
|
15 import structures.FloatList;
|
jpayne@68
|
16 import structures.IntList;
|
jpayne@68
|
17 import structures.LongHashSet;
|
jpayne@68
|
18
|
jpayne@68
|
19
|
jpayne@68
|
20 /**
|
jpayne@68
|
21 * This class calls genes within a single thread.
|
jpayne@68
|
22 * @author Brian Bushnell
|
jpayne@68
|
23 * @date Sep 24, 2018
|
jpayne@68
|
24 *
|
jpayne@68
|
25 */
|
jpayne@68
|
26 public class GeneCaller extends ProkObject {
|
jpayne@68
|
27
|
jpayne@68
|
28 /*--------------------------------------------------------------*/
|
jpayne@68
|
29 /*---------------- Init ----------------*/
|
jpayne@68
|
30 /*--------------------------------------------------------------*/
|
jpayne@68
|
31
|
jpayne@68
|
32 GeneCaller(int minLen_, int maxOverlapSameStrand_, int maxOverlapOppositeStrand_,
|
jpayne@68
|
33 float minStartScore_, float minStopScore_, float minInnerScore_,
|
jpayne@68
|
34 float minOrfScore_, float minAvgScore_, GeneModel pgm_){
|
jpayne@68
|
35 minLen=minLen_;
|
jpayne@68
|
36 maxOverlapSameStrand=maxOverlapSameStrand_;
|
jpayne@68
|
37 maxOverlapOppositeStrand=maxOverlapOppositeStrand_;
|
jpayne@68
|
38 pgm=pgm_;
|
jpayne@68
|
39
|
jpayne@68
|
40 minStartScore=minStartScore_;
|
jpayne@68
|
41 minStopScore=minStopScore_;
|
jpayne@68
|
42 minInnerScore=minInnerScore_;
|
jpayne@68
|
43 minOrfScore=minOrfScore_;
|
jpayne@68
|
44 minAvgScore=minAvgScore_;
|
jpayne@68
|
45 }
|
jpayne@68
|
46
|
jpayne@68
|
47 /*--------------------------------------------------------------*/
|
jpayne@68
|
48 /*---------------- Outer Methods ----------------*/
|
jpayne@68
|
49 /*--------------------------------------------------------------*/
|
jpayne@68
|
50
|
jpayne@68
|
51 public ArrayList<Orf> callGenes(Read r){
|
jpayne@68
|
52 return callGenes(r, pgm);
|
jpayne@68
|
53 }
|
jpayne@68
|
54
|
jpayne@68
|
55 public ArrayList<Orf> callGenes(Read r, GeneModel pgm_){
|
jpayne@68
|
56 pgm=pgm_;
|
jpayne@68
|
57
|
jpayne@68
|
58 final String name=r.id;
|
jpayne@68
|
59 final byte[] bases=r.bases;
|
jpayne@68
|
60
|
jpayne@68
|
61 //Lists of all longest orfs per frame
|
jpayne@68
|
62 ArrayList<Orf>[] frameLists=makeOrfs(name, bases, minLen);
|
jpayne@68
|
63 //Lists of all high-scoring orfs per frame, with potentially multiple orfs sharing stops.
|
jpayne@68
|
64 ArrayList<Orf>[] brokenLists=breakOrfs(frameLists, bases);
|
jpayne@68
|
65
|
jpayne@68
|
66 ArrayList<Orf>[] rnaLists=null;
|
jpayne@68
|
67 final int rlen=r.length();
|
jpayne@68
|
68 if(calltRNA || (call16S && rlen>800) || (call23S && rlen>1500) || call5S || (call18S && rlen>1000)){
|
jpayne@68
|
69 rnaLists=makeRnas(name, bases);
|
jpayne@68
|
70
|
jpayne@68
|
71 brokenLists[0].addAll(rnaLists[0]);
|
jpayne@68
|
72 brokenLists[3].addAll(rnaLists[1]);
|
jpayne@68
|
73 Collections.sort(brokenLists[0]);
|
jpayne@68
|
74 Collections.sort(brokenLists[3]);
|
jpayne@68
|
75 }
|
jpayne@68
|
76
|
jpayne@68
|
77 boolean printAllOrfs=false;
|
jpayne@68
|
78 boolean printRnas=false;
|
jpayne@68
|
79 if(printAllOrfs){
|
jpayne@68
|
80 ArrayList<Orf> temp=new ArrayList<Orf>();
|
jpayne@68
|
81 for(ArrayList<Orf> broken : brokenLists){
|
jpayne@68
|
82 temp.addAll(broken);
|
jpayne@68
|
83 }
|
jpayne@68
|
84 Collections.sort(temp);
|
jpayne@68
|
85 return temp;
|
jpayne@68
|
86 }
|
jpayne@68
|
87
|
jpayne@68
|
88 if(printRnas && rnaLists!=null){
|
jpayne@68
|
89 ArrayList<Orf> temp=new ArrayList<Orf>();
|
jpayne@68
|
90 for(ArrayList<Orf> list : rnaLists){
|
jpayne@68
|
91 temp.addAll(list);
|
jpayne@68
|
92 }
|
jpayne@68
|
93 Collections.sort(temp);
|
jpayne@68
|
94 return temp;
|
jpayne@68
|
95 }
|
jpayne@68
|
96
|
jpayne@68
|
97 stCds2.add(brokenLists);
|
jpayne@68
|
98
|
jpayne@68
|
99 //Find the optimal path through Orfs
|
jpayne@68
|
100 ArrayList<Orf> path=findPath(brokenLists, bases);
|
jpayne@68
|
101 // geneStartsOut+=path.size();
|
jpayne@68
|
102
|
jpayne@68
|
103 if(callCDS){stCdsPass.add(path);}
|
jpayne@68
|
104 if(calltRNA){sttRNA.add(path);}
|
jpayne@68
|
105 if(call16S){st16s.add(path);}
|
jpayne@68
|
106 if(call23S){st23s.add(path);}
|
jpayne@68
|
107 if(call5S){st5s.add(path);}
|
jpayne@68
|
108 if(call18S){st18s.add(path);}
|
jpayne@68
|
109
|
jpayne@68
|
110 return path;
|
jpayne@68
|
111 }
|
jpayne@68
|
112
|
jpayne@68
|
113 /*--------------------------------------------------------------*/
|
jpayne@68
|
114 /*---------------- Inner Methods ----------------*/
|
jpayne@68
|
115 /*--------------------------------------------------------------*/
|
jpayne@68
|
116
|
jpayne@68
|
117 /**
|
jpayne@68
|
118 * Generates lists of all max-length non-overlapping Orfs per frame.
|
jpayne@68
|
119 * There IS overlap between frames.
|
jpayne@68
|
120 * All Orfs come out flipped to + orientation.
|
jpayne@68
|
121 * */
|
jpayne@68
|
122 ArrayList<Orf>[] makeRnas(String name, byte[] bases){
|
jpayne@68
|
123 @SuppressWarnings("unchecked")
|
jpayne@68
|
124 ArrayList<Orf>[] array=new ArrayList[2];
|
jpayne@68
|
125 array[0]=new ArrayList<Orf>();
|
jpayne@68
|
126 array[1]=new ArrayList<Orf>();
|
jpayne@68
|
127 final float[] scores=new float[bases.length];
|
jpayne@68
|
128 final int[] kmersSeen=(lsuKmers==null && ssuKmers==null && trnaKmers==null && r5SKmers==null) ? null : new int[bases.length];
|
jpayne@68
|
129 for(int strand=0; strand<2; strand++){
|
jpayne@68
|
130 for(StatsContainer sc : pgm.rnaContainers){
|
jpayne@68
|
131 if(ProkObject.callType(sc.type)){
|
jpayne@68
|
132 ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, (sc.kmerSet()==null ? null : kmersSeen), false, -1);//TODO: Make this loop through all RNA types
|
jpayne@68
|
133 if(strand==1 && list!=null){
|
jpayne@68
|
134 for(Orf orf : list){
|
jpayne@68
|
135 assert(orf.strand==strand);
|
jpayne@68
|
136 orf.flip();
|
jpayne@68
|
137 }
|
jpayne@68
|
138 }
|
jpayne@68
|
139 if(list!=null){array[strand].addAll(list);}
|
jpayne@68
|
140 }
|
jpayne@68
|
141 }
|
jpayne@68
|
142 Collections.sort(array[strand]);
|
jpayne@68
|
143 AminoAcid.reverseComplementBasesInPlace(bases);
|
jpayne@68
|
144 }
|
jpayne@68
|
145 return array;
|
jpayne@68
|
146 }
|
jpayne@68
|
147
|
jpayne@68
|
148 /** Designed for quickly calling a single SSU */
|
jpayne@68
|
149 public Orf makeRna(String name, byte[] bases, int type){
|
jpayne@68
|
150 final float[] scores=new float[bases.length];//TODO: Big and slow; make a FloatList?
|
jpayne@68
|
151 StatsContainer sc=pgm.allContainers[type];
|
jpayne@68
|
152 final int[] kmersSeen=(sc.kmerSet()==null ? null : new int[bases.length]);//TODO: IntList?
|
jpayne@68
|
153
|
jpayne@68
|
154 int strand=0;
|
jpayne@68
|
155 ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1);
|
jpayne@68
|
156 final Orf best1=pickBest(list);
|
jpayne@68
|
157 assert(best1==null || best1.start>=0 && best1.stop<bases.length) : bases.length+"\n"+best1;
|
jpayne@68
|
158 if(best1!=null && best1.orfScore>-999){return best1;}
|
jpayne@68
|
159
|
jpayne@68
|
160 strand++;
|
jpayne@68
|
161 AminoAcid.reverseComplementBasesInPlace(bases);
|
jpayne@68
|
162 list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1);
|
jpayne@68
|
163 AminoAcid.reverseComplementBasesInPlace(bases);
|
jpayne@68
|
164 if(strand==1 && list!=null){
|
jpayne@68
|
165 for(Orf orf : list){
|
jpayne@68
|
166 assert(orf.strand==strand);
|
jpayne@68
|
167 orf.flip();
|
jpayne@68
|
168 }
|
jpayne@68
|
169 }
|
jpayne@68
|
170 final Orf best2=pickBest(list);
|
jpayne@68
|
171 assert(best2==null || best2.start>=0 && best2.stop<bases.length) : bases.length+"\n"+best2;
|
jpayne@68
|
172 if(best2!=null && best2.orfScore>-999){return best2;}
|
jpayne@68
|
173 return best1!=null ? best1 : best2;
|
jpayne@68
|
174 }
|
jpayne@68
|
175
|
jpayne@68
|
176 final Orf pickBest(ArrayList<Orf> list){
|
jpayne@68
|
177 if(list==null){return null;}
|
jpayne@68
|
178 Orf best=null;
|
jpayne@68
|
179 for(Orf orf : list){
|
jpayne@68
|
180 if(best==null || orf.orfScore>best.orfScore){
|
jpayne@68
|
181 best=orf;
|
jpayne@68
|
182 }
|
jpayne@68
|
183 }
|
jpayne@68
|
184 return best;
|
jpayne@68
|
185 }
|
jpayne@68
|
186
|
jpayne@68
|
187 /**
|
jpayne@68
|
188 * Generates lists of all max-length non-overlapping Orfs per frame.
|
jpayne@68
|
189 * There IS overlap between frames.
|
jpayne@68
|
190 * All Orfs come out flipped to + orientation.
|
jpayne@68
|
191 * */
|
jpayne@68
|
192 static ArrayList<Orf>[] makeOrfs(String name, byte[] bases, int minlen){
|
jpayne@68
|
193 @SuppressWarnings("unchecked")
|
jpayne@68
|
194 ArrayList<Orf>[] array=new ArrayList[6];
|
jpayne@68
|
195 for(int strand=0; strand<2; strand++){
|
jpayne@68
|
196 for(int frame=0; frame<3; frame++){
|
jpayne@68
|
197 ArrayList<Orf> list=makeOrfsForFrame(name, bases, frame, strand, minlen);
|
jpayne@68
|
198 array[frame+3*strand]=list;
|
jpayne@68
|
199 if(strand==1 && list!=null){
|
jpayne@68
|
200 for(Orf orf : list){
|
jpayne@68
|
201 assert(orf.frame==frame);
|
jpayne@68
|
202 assert(orf.strand==strand);
|
jpayne@68
|
203 orf.flip();
|
jpayne@68
|
204 }
|
jpayne@68
|
205 }
|
jpayne@68
|
206 }
|
jpayne@68
|
207 AminoAcid.reverseComplementBasesInPlace(bases);
|
jpayne@68
|
208 }
|
jpayne@68
|
209 return array;
|
jpayne@68
|
210 }
|
jpayne@68
|
211
|
jpayne@68
|
212 /**
|
jpayne@68
|
213 * Dynamic programming phase.
|
jpayne@68
|
214 * @param frameLists
|
jpayne@68
|
215 * @param bases
|
jpayne@68
|
216 * @return
|
jpayne@68
|
217 */
|
jpayne@68
|
218 private ArrayList<Orf> findPath(ArrayList<Orf>[] frameLists, byte[] bases){
|
jpayne@68
|
219 ArrayList<Orf> all=new ArrayList<Orf>();
|
jpayne@68
|
220 for(ArrayList<Orf> list : frameLists){all.addAll(list);}
|
jpayne@68
|
221 if(all.isEmpty()){return all;}
|
jpayne@68
|
222 Collections.sort(all);
|
jpayne@68
|
223
|
jpayne@68
|
224 for(Orf orf : all){
|
jpayne@68
|
225 orf.pathScorePlus=-999999;
|
jpayne@68
|
226 orf.pathScoreMinus=-999999;
|
jpayne@68
|
227 }
|
jpayne@68
|
228
|
jpayne@68
|
229 int[] lastPositionScored=KillSwitch.allocInt1D(6);
|
jpayne@68
|
230 Arrays.fill(lastPositionScored, -1);
|
jpayne@68
|
231
|
jpayne@68
|
232 //Index of highest-scoring ORF in this frame, with prev on the plus strand
|
jpayne@68
|
233 int[] bestIndexPlus=KillSwitch.allocInt1D(6);
|
jpayne@68
|
234 //Index of highest-scoring ORF in this frame, with prev on the minus strand
|
jpayne@68
|
235 int[] bestIndexMinus=KillSwitch.allocInt1D(6);
|
jpayne@68
|
236 //Highest-scoring ORF in this frame, with prev on the plus strand
|
jpayne@68
|
237 Orf[] bestOrfPlus=new Orf[6];
|
jpayne@68
|
238 //Highest-scoring ORF in this frame, with prev on the minus strand
|
jpayne@68
|
239 Orf[] bestOrfMinus=new Orf[6];
|
jpayne@68
|
240
|
jpayne@68
|
241 int[][] bestIndex=new int[][] {bestIndexPlus, bestIndexMinus};
|
jpayne@68
|
242 Orf[][] bestOrf=new Orf[][] {bestOrfPlus, bestOrfMinus};
|
jpayne@68
|
243
|
jpayne@68
|
244 for(Orf orf : all){
|
jpayne@68
|
245 final int myListNum=3*orf.strand+orf.frame;
|
jpayne@68
|
246 calcPathScore(orf, frameLists, lastPositionScored, bestIndex);
|
jpayne@68
|
247 if(bestOrfPlus[myListNum]==null || orf.pathScorePlus>=bestOrfPlus[myListNum].pathScorePlus){
|
jpayne@68
|
248 bestOrfPlus[myListNum]=orf;
|
jpayne@68
|
249 bestIndexPlus[myListNum]=lastPositionScored[myListNum];
|
jpayne@68
|
250 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf);
|
jpayne@68
|
251 }
|
jpayne@68
|
252 if(bestOrfMinus[myListNum]==null || orf.pathScoreMinus>=bestOrfMinus[myListNum].pathScoreMinus){
|
jpayne@68
|
253 bestOrfMinus[myListNum]=orf;
|
jpayne@68
|
254 bestIndexMinus[myListNum]=lastPositionScored[myListNum];
|
jpayne@68
|
255 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf);
|
jpayne@68
|
256 }
|
jpayne@68
|
257 }
|
jpayne@68
|
258
|
jpayne@68
|
259 Orf best=bestOrf[0][0];
|
jpayne@68
|
260 for(Orf[] array : bestOrf){
|
jpayne@68
|
261 for(Orf orf : array){
|
jpayne@68
|
262 if(best==null || (orf!=null && orf.pathScore()>best.pathScore())){
|
jpayne@68
|
263 best=orf;
|
jpayne@68
|
264 }
|
jpayne@68
|
265 }
|
jpayne@68
|
266 }
|
jpayne@68
|
267 ArrayList<Orf> bestPath=new ArrayList<Orf>();
|
jpayne@68
|
268 for(Orf orf=best; orf!=null; orf=orf.prev()){
|
jpayne@68
|
269 bestPath.add(orf);
|
jpayne@68
|
270 if(orf.type==CDS){geneStartsOut++;}
|
jpayne@68
|
271 else if(orf.type==tRNA){tRNAOut++;}
|
jpayne@68
|
272 else if(orf.type==r16S){r16SOut++;}
|
jpayne@68
|
273 else if(orf.type==r23S){r23SOut++;}
|
jpayne@68
|
274 else if(orf.type==r5S){r5SOut++;}
|
jpayne@68
|
275 else if(orf.type==r18S){r18SOut++;}
|
jpayne@68
|
276 }
|
jpayne@68
|
277 Collections.sort(bestPath);
|
jpayne@68
|
278 return bestPath;
|
jpayne@68
|
279 }
|
jpayne@68
|
280
|
jpayne@68
|
281 /**
|
jpayne@68
|
282 * Calculate the best path to this ORF.
|
jpayne@68
|
283 * @param orf
|
jpayne@68
|
284 * @param frameLists
|
jpayne@68
|
285 * @param lastPositionScored
|
jpayne@68
|
286 * @param bestIndex
|
jpayne@68
|
287 */
|
jpayne@68
|
288 private void calcPathScore(Orf orf, ArrayList<Orf>[] frameLists, int[] lastPositionScored, int[][] bestIndex){
|
jpayne@68
|
289 final int myListNum=3*orf.strand+orf.frame;
|
jpayne@68
|
290
|
jpayne@68
|
291 // System.err.println("* "+orf);
|
jpayne@68
|
292 // System.err.println("* "+Arrays.toString(lastPositionScored));
|
jpayne@68
|
293 // System.err.println();
|
jpayne@68
|
294
|
jpayne@68
|
295 for(int listStrand=0; listStrand<2; listStrand++){
|
jpayne@68
|
296 for(int listFrame=0; listFrame<3; listFrame++){
|
jpayne@68
|
297 int listNum=listFrame+3*listStrand;
|
jpayne@68
|
298 ArrayList<Orf> list=frameLists[listNum];
|
jpayne@68
|
299 int lastPos=lastPositionScored[listNum];
|
jpayne@68
|
300 int bestPos=bestIndex[listStrand][listNum];
|
jpayne@68
|
301 if(listStrand==0){
|
jpayne@68
|
302 calcPathScorePlus(orf, list, listStrand, lastPos, bestPos);
|
jpayne@68
|
303 }else{
|
jpayne@68
|
304 calcPathScoreMinus(orf, list, listStrand, lastPos, bestPos);
|
jpayne@68
|
305 }
|
jpayne@68
|
306 }
|
jpayne@68
|
307 }
|
jpayne@68
|
308
|
jpayne@68
|
309 // System.err.println(myListNum+", "+Arrays.toString(lastPositionScored)+", "+frameLists[myListNum].size());
|
jpayne@68
|
310
|
jpayne@68
|
311 lastPositionScored[myListNum]++;
|
jpayne@68
|
312 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf) : myListNum+"\n"+orf+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum])+"\n"
|
jpayne@68
|
313 +Arrays.toString(lastPositionScored)+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum]+1);
|
jpayne@68
|
314
|
jpayne@68
|
315 //These are sanity checks to make sure that the path did not break in the middle.
|
jpayne@68
|
316 //Safe to disable.
|
jpayne@68
|
317 // assert(orf.prevPlus!=null || orf.stop<100000);
|
jpayne@68
|
318 // assert(orf.prevMinus!=null || orf.stop<100000);
|
jpayne@68
|
319 // assert(orf.pathScore>-10) : orf.pathScore+"\n"+orf+"\n"+orf.prev+"\n";
|
jpayne@68
|
320 }
|
jpayne@68
|
321
|
jpayne@68
|
322 /**
|
jpayne@68
|
323 * Calculate the best path to this ORF from a plus-strand previous ORF.
|
jpayne@68
|
324 * @param orf
|
jpayne@68
|
325 * @param list
|
jpayne@68
|
326 * @param listStrand
|
jpayne@68
|
327 * @param lastPos
|
jpayne@68
|
328 * @param bestPos
|
jpayne@68
|
329 */
|
jpayne@68
|
330 private void calcPathScorePlus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){
|
jpayne@68
|
331 assert(listStrand==0);
|
jpayne@68
|
332 if(lastPos<0){
|
jpayne@68
|
333 if(orf.prevPlus==null){
|
jpayne@68
|
334 orf.pathScorePlus=orf.orfScore;
|
jpayne@68
|
335 orf.pathLengthPlus=1;
|
jpayne@68
|
336 }
|
jpayne@68
|
337 return;
|
jpayne@68
|
338 }
|
jpayne@68
|
339 if(list.isEmpty()){return;}
|
jpayne@68
|
340
|
jpayne@68
|
341 // System.err.println("\nExamining \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame);
|
jpayne@68
|
342 boolean found=false;
|
jpayne@68
|
343 final boolean sameStrand=(orf.strand==listStrand);
|
jpayne@68
|
344 final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand);
|
jpayne@68
|
345 for(int i=lastPos, min=Tools.max(0, bestPos-lookbackPlus); i>=min || (i>0 && !found); i--){
|
jpayne@68
|
346 Orf prev=list.get(i);
|
jpayne@68
|
347 assert(prev!=orf) : prev;
|
jpayne@68
|
348 // System.err.println("Comparing to \t"+prev);
|
jpayne@68
|
349 if(orf.isValidPrev(prev, maxOverlap)){
|
jpayne@68
|
350 int overlap=Tools.max(0, prev.stop-orf.start+1);
|
jpayne@68
|
351 float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap);
|
jpayne@68
|
352
|
jpayne@68
|
353 final float prevScore=prev.pathScore();
|
jpayne@68
|
354 final int prevLength=prev.pathLength();
|
jpayne@68
|
355
|
jpayne@68
|
356 float pathScore;
|
jpayne@68
|
357 final int pathLength;
|
jpayne@68
|
358 if(sameStrand){
|
jpayne@68
|
359 pathLength=prevLength+1;
|
jpayne@68
|
360 pathScore=prevScore+orfScore;
|
jpayne@68
|
361 pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4));
|
jpayne@68
|
362 }else{
|
jpayne@68
|
363 pathLength=1;
|
jpayne@68
|
364 pathScore=prev.pathScore()+orfScore;
|
jpayne@68
|
365 pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5);
|
jpayne@68
|
366 }
|
jpayne@68
|
367
|
jpayne@68
|
368 if(overlap<1 && prevScore>0){found=true;}
|
jpayne@68
|
369 if(pathScore>=orf.pathScorePlus){
|
jpayne@68
|
370 orf.pathScorePlus=pathScore;
|
jpayne@68
|
371 orf.prevPlus=prev;
|
jpayne@68
|
372 orf.pathLengthPlus=pathLength;
|
jpayne@68
|
373 // System.err.println("Set as best");
|
jpayne@68
|
374 }
|
jpayne@68
|
375 }
|
jpayne@68
|
376 if(found && prev.stop<maxOverlap-2000 && orf.prevPlus!=null){
|
jpayne@68
|
377 System.err.println("Breaking");
|
jpayne@68
|
378 break;
|
jpayne@68
|
379 }
|
jpayne@68
|
380 }
|
jpayne@68
|
381 }
|
jpayne@68
|
382
|
jpayne@68
|
383 /**
|
jpayne@68
|
384 * Calculate the best path to this ORF from a minus-strand previous ORF.
|
jpayne@68
|
385 * @param orf
|
jpayne@68
|
386 * @param list
|
jpayne@68
|
387 * @param listStrand
|
jpayne@68
|
388 * @param lastPos
|
jpayne@68
|
389 * @param bestPos
|
jpayne@68
|
390 */
|
jpayne@68
|
391 private void calcPathScoreMinus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){
|
jpayne@68
|
392 assert(listStrand==1);
|
jpayne@68
|
393 if(lastPos<0){
|
jpayne@68
|
394 if(orf.prevMinus==null){
|
jpayne@68
|
395 orf.pathScoreMinus=orf.orfScore;
|
jpayne@68
|
396 orf.pathLengthMinus=1;
|
jpayne@68
|
397 }
|
jpayne@68
|
398 return;
|
jpayne@68
|
399 }
|
jpayne@68
|
400 if(list.isEmpty()){return;}
|
jpayne@68
|
401
|
jpayne@68
|
402 // System.err.println("\nExamining \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame);
|
jpayne@68
|
403 boolean found=false;
|
jpayne@68
|
404 final boolean sameStrand=(orf.strand==listStrand);
|
jpayne@68
|
405 final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand);
|
jpayne@68
|
406 for(int i=lastPos, min=Tools.max(0, bestPos-lookbackMinus); i>=min || (i>0 && !found); i--){
|
jpayne@68
|
407 Orf prev=list.get(i);
|
jpayne@68
|
408 assert(prev!=orf) : prev;
|
jpayne@68
|
409 // System.err.println("Comparing to \t"+prev);
|
jpayne@68
|
410 if(orf.isValidPrev(prev, maxOverlap)){
|
jpayne@68
|
411 int overlap=Tools.max(0, prev.stop-orf.start+1);
|
jpayne@68
|
412 float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap);
|
jpayne@68
|
413
|
jpayne@68
|
414 final float prevScore=prev.pathScore();
|
jpayne@68
|
415 final int prevLength=prev.pathLength();
|
jpayne@68
|
416
|
jpayne@68
|
417 float pathScore;
|
jpayne@68
|
418 final int pathLength;
|
jpayne@68
|
419 if(sameStrand){
|
jpayne@68
|
420 pathLength=prevLength+1;
|
jpayne@68
|
421 pathScore=prevScore+orfScore;
|
jpayne@68
|
422 pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4));
|
jpayne@68
|
423 }else{
|
jpayne@68
|
424 pathLength=1;
|
jpayne@68
|
425 pathScore=prev.pathScore()+orfScore;
|
jpayne@68
|
426 pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5);
|
jpayne@68
|
427 }
|
jpayne@68
|
428 if(overlap<1 && prevScore>0){found=true;}
|
jpayne@68
|
429 if(pathScore>=orf.pathScoreMinus){
|
jpayne@68
|
430 orf.pathScoreMinus=pathScore;
|
jpayne@68
|
431 orf.prevMinus=prev;
|
jpayne@68
|
432 orf.pathLengthMinus=pathLength;
|
jpayne@68
|
433 // System.err.println("Set as best");
|
jpayne@68
|
434 }
|
jpayne@68
|
435 }
|
jpayne@68
|
436 if(found && prev.stop<maxOverlap-2000 && orf.prevMinus!=null){
|
jpayne@68
|
437 System.err.println("Breaking");
|
jpayne@68
|
438 break;
|
jpayne@68
|
439 }
|
jpayne@68
|
440 }
|
jpayne@68
|
441 }
|
jpayne@68
|
442
|
jpayne@68
|
443 /**
|
jpayne@68
|
444 * Generates a list of maximal-length Orfs only (non-overlapping).
|
jpayne@68
|
445 * All Orfs come out in native orientation (unflipped).
|
jpayne@68
|
446 * */
|
jpayne@68
|
447 static ArrayList<Orf> makeOrfsForFrame(String name, byte[] bases, int startFrame, int strand, int minlen){
|
jpayne@68
|
448 // assert(false) : "TODO";
|
jpayne@68
|
449 assert(minlen>=3);
|
jpayne@68
|
450 if(bases==null || bases.length<minlen){return null;}
|
jpayne@68
|
451 ArrayList<Orf> orfs=new ArrayList<Orf>();
|
jpayne@68
|
452 if(!ProkObject.callCDS){return orfs;}
|
jpayne@68
|
453 // int mask=63;
|
jpayne@68
|
454 int code=0;
|
jpayne@68
|
455 int start=-2;
|
jpayne@68
|
456 int frame=0;
|
jpayne@68
|
457 int pos=startFrame;
|
jpayne@68
|
458
|
jpayne@68
|
459
|
jpayne@68
|
460 for(; pos<bases.length; pos++){
|
jpayne@68
|
461 byte b=bases[pos];
|
jpayne@68
|
462 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
463 // code=((code<<2)|x)&mask;
|
jpayne@68
|
464 code=((code<<2)|x);
|
jpayne@68
|
465 frame++;
|
jpayne@68
|
466 if(frame==3){
|
jpayne@68
|
467 frame=0;
|
jpayne@68
|
468 if(start>=0){
|
jpayne@68
|
469 if(GeneModel.isStopCodon(code) || code<0){//NOTE: This adds a stop codon wherever there are Ns.
|
jpayne@68
|
470 int len=pos-start+1;
|
jpayne@68
|
471 if(len>=minlen){
|
jpayne@68
|
472 Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS);
|
jpayne@68
|
473 orfs.add(f);
|
jpayne@68
|
474 }
|
jpayne@68
|
475 start=-1;
|
jpayne@68
|
476 }
|
jpayne@68
|
477 }else{
|
jpayne@68
|
478 if(start==-2 || (start<0 && GeneModel.isStartCodon(code))){
|
jpayne@68
|
479 start=pos-2;
|
jpayne@68
|
480 }
|
jpayne@68
|
481 }
|
jpayne@68
|
482 code=0;
|
jpayne@68
|
483 }
|
jpayne@68
|
484 }
|
jpayne@68
|
485
|
jpayne@68
|
486 //Add a stop codon at the sequence end.
|
jpayne@68
|
487 if(start>=0){
|
jpayne@68
|
488 pos--;
|
jpayne@68
|
489 while(frame!=3 && frame!=-1){
|
jpayne@68
|
490 pos--;
|
jpayne@68
|
491 frame--;
|
jpayne@68
|
492 }
|
jpayne@68
|
493 int len=pos-start+1;
|
jpayne@68
|
494 if(len>=minlen){
|
jpayne@68
|
495 assert(pos<bases.length) : start+", "+pos+", "+bases.length;
|
jpayne@68
|
496 Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS);
|
jpayne@68
|
497 orfs.add(f);
|
jpayne@68
|
498 }
|
jpayne@68
|
499 }
|
jpayne@68
|
500
|
jpayne@68
|
501 return orfs;
|
jpayne@68
|
502 }
|
jpayne@68
|
503
|
jpayne@68
|
504 /**
|
jpayne@68
|
505 * Generates a list of maximal-length RNAs (non-overlapping).
|
jpayne@68
|
506 * All RNAs come out in native orientation (unflipped).
|
jpayne@68
|
507 * */
|
jpayne@68
|
508 ArrayList<Orf> makeRnasForStrand(String name, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen, boolean quitEarly, float bias){
|
jpayne@68
|
509 final int window=sc.lengthAvg;
|
jpayne@68
|
510 if(bases==null || bases.length*2<window){return null;}
|
jpayne@68
|
511 ArrayList<Orf> orfs=new ArrayList<Orf>(sc.type==tRNA ? 32 : 8);
|
jpayne@68
|
512
|
jpayne@68
|
513 final FrameStats inner=sc.inner;
|
jpayne@68
|
514 // final FrameStats start=sc.start;
|
jpayne@68
|
515 // final FrameStats stop=sc.stop;
|
jpayne@68
|
516
|
jpayne@68
|
517 final int k=inner.k;
|
jpayne@68
|
518 final int mask=inner.mask;
|
jpayne@68
|
519 // final float invLen=sc.invLengthAvg;
|
jpayne@68
|
520 final int halfWindow=window/2;
|
jpayne@68
|
521 final int maxWindow=(int)(window*1.5f);
|
jpayne@68
|
522 final int maxWindow2=(int)(window*2.5f);
|
jpayne@68
|
523 // final int slop=Tools.max(50, window/8);
|
jpayne@68
|
524 int len=0;
|
jpayne@68
|
525 int kmer=0;
|
jpayne@68
|
526 float currentScore=0;
|
jpayne@68
|
527 float currentScoreAbs=0;
|
jpayne@68
|
528 bias=(bias>-1 ? bias : biases[sc.type]);
|
jpayne@68
|
529 final float maxBias=biases[sc.type]*1.45f;
|
jpayne@68
|
530
|
jpayne@68
|
531 float thresh=cutoff1[sc.type];
|
jpayne@68
|
532 float prevScore=0;
|
jpayne@68
|
533 int lastStart=0;
|
jpayne@68
|
534
|
jpayne@68
|
535 float max=0;
|
jpayne@68
|
536 int maxPos=0;
|
jpayne@68
|
537
|
jpayne@68
|
538 for(int pos=0; pos<bases.length; pos++){
|
jpayne@68
|
539 final byte b=bases[pos];
|
jpayne@68
|
540 assert(b>=0 && b<128) : "Invalid base b="+((int)b)+"; pos="+pos+"\n"+new String(bases)+"\n";
|
jpayne@68
|
541 final int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
542 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
543
|
jpayne@68
|
544 if(x>=0){
|
jpayne@68
|
545 len++;
|
jpayne@68
|
546 if(len>=k){
|
jpayne@68
|
547 float prob=inner.probs[0][kmer];
|
jpayne@68
|
548 float dif=prob-bias;//Prob above 1 is more likely than average
|
jpayne@68
|
549 currentScoreAbs+=prob;
|
jpayne@68
|
550 currentScore=Tools.max(0, currentScore+dif);
|
jpayne@68
|
551 }
|
jpayne@68
|
552
|
jpayne@68
|
553 if(currentScore>0){
|
jpayne@68
|
554 if(currentScore>max){
|
jpayne@68
|
555 max=currentScore;
|
jpayne@68
|
556 maxPos=pos;
|
jpayne@68
|
557 }
|
jpayne@68
|
558 if(prevScore<=0){
|
jpayne@68
|
559 lastStart=pos;
|
jpayne@68
|
560 }
|
jpayne@68
|
561 }else{
|
jpayne@68
|
562 int rnaLen=maxPos-lastStart;
|
jpayne@68
|
563 if(max>thresh && rnaLen>=halfWindow){
|
jpayne@68
|
564 if(rnaLen>maxWindow){
|
jpayne@68
|
565 if(bias<=maxBias){
|
jpayne@68
|
566 orfs=null;
|
jpayne@68
|
567 float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f);
|
jpayne@68
|
568 return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult);
|
jpayne@68
|
569 }
|
jpayne@68
|
570 }
|
jpayne@68
|
571 if(rnaLen<=maxWindow2){
|
jpayne@68
|
572 Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type);
|
jpayne@68
|
573 orfs.add(orf);
|
jpayne@68
|
574 orf.orfScore=max;
|
jpayne@68
|
575 if(verbose){
|
jpayne@68
|
576 final int start2=(strand==0 ? lastStart : bases.length-maxPos-1);
|
jpayne@68
|
577 final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1);
|
jpayne@68
|
578 System.err.println("Made Orf "+start2+"\t"+stop2+"\t"+max);
|
jpayne@68
|
579 }
|
jpayne@68
|
580 }
|
jpayne@68
|
581 }
|
jpayne@68
|
582 max=0;
|
jpayne@68
|
583 lastStart=pos;
|
jpayne@68
|
584 }
|
jpayne@68
|
585 // System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart);
|
jpayne@68
|
586 prevScore=currentScore;
|
jpayne@68
|
587
|
jpayne@68
|
588 // if(pos>=223000 && pos<232000){
|
jpayne@68
|
589 //// System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart);
|
jpayne@68
|
590 // System.out.println(pos+"\t"+currentScore);
|
jpayne@68
|
591 // }
|
jpayne@68
|
592
|
jpayne@68
|
593 }else{
|
jpayne@68
|
594 len=0;
|
jpayne@68
|
595 kmer=0;
|
jpayne@68
|
596 }
|
jpayne@68
|
597 scores[pos]=currentScoreAbs;
|
jpayne@68
|
598 }
|
jpayne@68
|
599
|
jpayne@68
|
600 // System.err.println("size="+orfs.size()+", type="+Orf.typeStrings[sc.type]);
|
jpayne@68
|
601
|
jpayne@68
|
602
|
jpayne@68
|
603 {
|
jpayne@68
|
604 int rnaLen=maxPos-lastStart;
|
jpayne@68
|
605 if(max>thresh && rnaLen>=halfWindow){
|
jpayne@68
|
606 if(rnaLen>maxWindow){
|
jpayne@68
|
607 if(bias<=maxBias){
|
jpayne@68
|
608 orfs=null;
|
jpayne@68
|
609 float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f);
|
jpayne@68
|
610 return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult);
|
jpayne@68
|
611 }
|
jpayne@68
|
612 }
|
jpayne@68
|
613 if(rnaLen<=maxWindow2){
|
jpayne@68
|
614 Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type);
|
jpayne@68
|
615 orfs.add(orf);
|
jpayne@68
|
616 orf.orfScore=max;
|
jpayne@68
|
617 if(verbose){
|
jpayne@68
|
618 final int start2=(strand==0 ? lastStart : bases.length-maxPos-1);
|
jpayne@68
|
619 final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1);
|
jpayne@68
|
620 System.err.println(start2+"\t"+stop2+"\t"+max);
|
jpayne@68
|
621 }
|
jpayne@68
|
622 }
|
jpayne@68
|
623 }
|
jpayne@68
|
624 }
|
jpayne@68
|
625
|
jpayne@68
|
626 if(kmersSeen!=null && orfs.size()>0 && sc.kmerSet()!=null){
|
jpayne@68
|
627 fillKmersSeen(bases, kmersSeen, sc.kmerSet(), sc.kLongLen());
|
jpayne@68
|
628 }
|
jpayne@68
|
629
|
jpayne@68
|
630 float cutoff=cutoff2[sc.type];
|
jpayne@68
|
631
|
jpayne@68
|
632 for(int i=0; i<orfs.size(); i++){
|
jpayne@68
|
633 Orf orf=orfs.get(i);
|
jpayne@68
|
634 // System.err.println(orf.orfScore);
|
jpayne@68
|
635 boolean good=refineRna(orf, bases, strand, sc, scores, kmersSeen);
|
jpayne@68
|
636 if(orf.orfScore<cutoff || !good){
|
jpayne@68
|
637 if(verbose){System.err.println("REJECT: "+orf.toStringFlipped());}
|
jpayne@68
|
638 orfs.set(i, null);
|
jpayne@68
|
639 }else{
|
jpayne@68
|
640 if(verbose){System.err.println("ACCEPT: "+orf.toStringFlipped());}
|
jpayne@68
|
641 if(quitEarly){
|
jpayne@68
|
642 orfs.clear();
|
jpayne@68
|
643 orfs.add(orf);
|
jpayne@68
|
644 return orfs;
|
jpayne@68
|
645 }
|
jpayne@68
|
646 }
|
jpayne@68
|
647 }
|
jpayne@68
|
648 Tools.condenseStrict(orfs);
|
jpayne@68
|
649
|
jpayne@68
|
650 // assert(false);
|
jpayne@68
|
651
|
jpayne@68
|
652 // for(int pos=0; pos<bases.length; pos++){
|
jpayne@68
|
653 // final byte b=bases[pos];
|
jpayne@68
|
654 // final int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
655 // kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
656 //
|
jpayne@68
|
657 // if(x>=0){
|
jpayne@68
|
658 // len++;
|
jpayne@68
|
659 // if(len>=k){
|
jpayne@68
|
660 // float prob=inner.probs[0][kmer];
|
jpayne@68
|
661 // float dif=prob-1.2f;//Prob above 1 is more likely than average
|
jpayne@68
|
662 // currentScore=Tools.max(0, currentScore+dif);
|
jpayne@68
|
663 // if(currentScore>0){
|
jpayne@68
|
664 // currentStreak++;
|
jpayne@68
|
665 // }else{
|
jpayne@68
|
666 // currentStreak=0;
|
jpayne@68
|
667 // }
|
jpayne@68
|
668 // if(currentScore>200 && currentStreak>1500 && currentStreak<1700){
|
jpayne@68
|
669 // Orf orf=new Orf(name, pos-currentStreak-1, pos, strand, 0, bases, false);
|
jpayne@68
|
670 // orfs.add(orf);
|
jpayne@68
|
671 // orf.orfScore=currentScore;
|
jpayne@68
|
672 // orf.startScore=start.scorePoint(orf.start, bases);
|
jpayne@68
|
673 // orf.stopScore=stop.scorePoint(orf.stop, bases);
|
jpayne@68
|
674 // currentStreak=0;
|
jpayne@68
|
675 // currentScore=0;
|
jpayne@68
|
676 // }
|
jpayne@68
|
677 // }
|
jpayne@68
|
678 // }else{
|
jpayne@68
|
679 // len=0;
|
jpayne@68
|
680 // kmer=0;
|
jpayne@68
|
681 // }
|
jpayne@68
|
682 // }
|
jpayne@68
|
683
|
jpayne@68
|
684 return orfs;
|
jpayne@68
|
685 }
|
jpayne@68
|
686
|
jpayne@68
|
687 void fillKmersSeen(byte[] bases, int[] kmersSeen, LongHashSet set, int k){
|
jpayne@68
|
688 final long mask=~((-1L)<<(2*k));
|
jpayne@68
|
689 long kmer=0;
|
jpayne@68
|
690 int len=0;
|
jpayne@68
|
691 int seen=0;
|
jpayne@68
|
692 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
693 final byte b=bases[i];
|
jpayne@68
|
694 final int num=AminoAcid.baseToNumber[b];
|
jpayne@68
|
695 if(num>=0){
|
jpayne@68
|
696 len++;
|
jpayne@68
|
697 kmer=((kmer<<2)|num)&mask;
|
jpayne@68
|
698 if(len>=k && set.contains(kmer)){seen++;}
|
jpayne@68
|
699 }else{
|
jpayne@68
|
700 len=0;
|
jpayne@68
|
701 }
|
jpayne@68
|
702 kmersSeen[i]=seen;
|
jpayne@68
|
703 }
|
jpayne@68
|
704 }
|
jpayne@68
|
705
|
jpayne@68
|
706 boolean refineRna(Orf orf, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen){
|
jpayne@68
|
707 if(orf==null){return false;}
|
jpayne@68
|
708 if(verbose){System.err.println("REFINE: "+orf.toStringFlipped());}
|
jpayne@68
|
709 final int window=sc.lengthAvg;
|
jpayne@68
|
710 // final int halfWindow=window/2;
|
jpayne@68
|
711 // final int maxWindow=(int)(window*1.5f);
|
jpayne@68
|
712 final int slop=Tools.max(60, 10+window/8);
|
jpayne@68
|
713 final float invWindow=sc.invLengthAvg;
|
jpayne@68
|
714 final float oldScore=orf.orfScore;
|
jpayne@68
|
715 IntList starts=new IntList();
|
jpayne@68
|
716 IntList stops=new IntList();
|
jpayne@68
|
717 FloatList startScores=new FloatList();
|
jpayne@68
|
718 FloatList stopScores=new FloatList();
|
jpayne@68
|
719
|
jpayne@68
|
720 final int leftmost=Tools.max(0, orf.start-slop);
|
jpayne@68
|
721 final int rightmost=Tools.min(bases.length-1, orf.stop+slop);
|
jpayne@68
|
722 if(kmersSeen!=null){
|
jpayne@68
|
723 if(kmersSeen[leftmost]>=kmersSeen[rightmost]){
|
jpayne@68
|
724 // System.err.println("Bad: "+oldScore);
|
jpayne@68
|
725 orf.orfScore=-999;
|
jpayne@68
|
726 return false;
|
jpayne@68
|
727 }else{
|
jpayne@68
|
728 // System.err.println("Good: "+oldScore);
|
jpayne@68
|
729 }
|
jpayne@68
|
730 }
|
jpayne@68
|
731 if(verbose){System.err.println("REFINE2");}
|
jpayne@68
|
732
|
jpayne@68
|
733 {//starts
|
jpayne@68
|
734 final int left=leftmost;
|
jpayne@68
|
735 final int right=Tools.min(bases.length-1, orf.stop+slop-window);
|
jpayne@68
|
736 final float thresh=cutoff3[sc.type];
|
jpayne@68
|
737 fillPoints(left, right, bases, sc.start, thresh, starts, startScores);
|
jpayne@68
|
738 }
|
jpayne@68
|
739 if(verbose){System.err.println("starts: "+starts.size);}
|
jpayne@68
|
740 // if((orf.start+"").startsWith("146") || true){System.err.println(starts);}
|
jpayne@68
|
741 if(verbose){System.err.println(startScores);}
|
jpayne@68
|
742
|
jpayne@68
|
743 {//stops
|
jpayne@68
|
744 final int left=Tools.max(0, orf.start-slop+window);
|
jpayne@68
|
745 final int right=rightmost;
|
jpayne@68
|
746 final float thresh=cutoff4[sc.type];
|
jpayne@68
|
747 fillPoints(left, right, bases, sc.stop, thresh, stops, stopScores);
|
jpayne@68
|
748 }
|
jpayne@68
|
749 if(verbose){System.err.println("stops: "+stops.size);}
|
jpayne@68
|
750 // if((orf.start+"").startsWith("146") || true){System.err.println(stops);}
|
jpayne@68
|
751 if(verbose){System.err.println(stopScores);}
|
jpayne@68
|
752
|
jpayne@68
|
753 final float innerThresh=cutoff5[sc.type];
|
jpayne@68
|
754
|
jpayne@68
|
755 final int minlen=Tools.max(window/2, window-slop);
|
jpayne@68
|
756 final int maxlen=window+slop;
|
jpayne@68
|
757
|
jpayne@68
|
758 orf.orfScore=0;
|
jpayne@68
|
759 int lastStop=0;
|
jpayne@68
|
760 for(int i=0; i<starts.size; i++){
|
jpayne@68
|
761 final int start=starts.get(i);
|
jpayne@68
|
762 final int startSeen=kmersSeen==null ? 0 : kmersSeen[start];
|
jpayne@68
|
763 final float startScore=startScores.get(i);
|
jpayne@68
|
764 // System.err.println("start="+start);
|
jpayne@68
|
765 for(int j=lastStop; j<stops.size; j++){
|
jpayne@68
|
766 final int stop=stops.get(j);
|
jpayne@68
|
767 final int rnalen=stop-start+1;
|
jpayne@68
|
768 // System.err.println("stop="+stop);
|
jpayne@68
|
769 if(rnalen<minlen){
|
jpayne@68
|
770 lastStop=j+1;
|
jpayne@68
|
771 }else if(rnalen>maxlen){
|
jpayne@68
|
772 // System.err.println("broke");
|
jpayne@68
|
773 break;
|
jpayne@68
|
774 }else if(kmersSeen!=null && kmersSeen[stop]<=startSeen){//TODO: Test this
|
jpayne@68
|
775 // //skip
|
jpayne@68
|
776 }else{
|
jpayne@68
|
777 final int len=stop-start+1;
|
jpayne@68
|
778 final float stopScore=stopScores.get(j);
|
jpayne@68
|
779 final float innerScore=(scores[stop]-scores[start])/len;
|
jpayne@68
|
780 // System.err.println("innerScore="+innerScore);
|
jpayne@68
|
781 assert(rnalen<=maxlen) : "start="+start+", stop="+stop+", rnalen="+rnalen+", minlen="+minlen+", maxlen="+maxlen;
|
jpayne@68
|
782 if(innerScore>=innerThresh){
|
jpayne@68
|
783 final float a=Tools.max(startScore+0.25f, 0.25f);
|
jpayne@68
|
784 final float b=Tools.max(stopScore+0.25f, 0.25f);
|
jpayne@68
|
785 final float c=innerScore*innerScore;
|
jpayne@68
|
786 final float d=(window-(2.4f*Tools.absdif(len, window)))*invWindow;
|
jpayne@68
|
787 final float score=c*d*(float)Math.sqrt(a*b);
|
jpayne@68
|
788 if(verbose && score>=0.2f*orf.orfScore){
|
jpayne@68
|
789 final int start2=(strand==0 ? start : bases.length-stop-1);
|
jpayne@68
|
790 final int stop2=(strand==0 ? stop : bases.length-start-1);
|
jpayne@68
|
791 System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore);
|
jpayne@68
|
792 }
|
jpayne@68
|
793 if(score>orf.orfScore){
|
jpayne@68
|
794 orf.start=start;
|
jpayne@68
|
795 orf.stop=stop;
|
jpayne@68
|
796 orf.kmerScore=innerScore*len;
|
jpayne@68
|
797 // if(verbose){
|
jpayne@68
|
798 // final int start2=(strand==0 ? start : bases.length-stop-1);
|
jpayne@68
|
799 // final int stop2=(strand==0 ? stop : bases.length-start-1);
|
jpayne@68
|
800 // System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore);
|
jpayne@68
|
801 // }
|
jpayne@68
|
802 orf.startScore=startScore;
|
jpayne@68
|
803 orf.stopScore=stopScore;
|
jpayne@68
|
804 orf.orfScore=score;
|
jpayne@68
|
805 }
|
jpayne@68
|
806 }else{
|
jpayne@68
|
807 assert(true);
|
jpayne@68
|
808 }
|
jpayne@68
|
809 }
|
jpayne@68
|
810 }
|
jpayne@68
|
811 }
|
jpayne@68
|
812 orf.orfScore*=scoreMult[sc.type];
|
jpayne@68
|
813
|
jpayne@68
|
814 if(starts.isEmpty() || stops.isEmpty()){
|
jpayne@68
|
815 if(verbose){System.err.println("No starts or stops.");}
|
jpayne@68
|
816 orf.orfScore=Tools.min(-999, orf.orfScore-9999);
|
jpayne@68
|
817 return false;
|
jpayne@68
|
818 }
|
jpayne@68
|
819 return refineByAlignment(orf, bases, strand, sc);//Returns true if no consensus is present
|
jpayne@68
|
820 }
|
jpayne@68
|
821
|
jpayne@68
|
822 boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc){
|
jpayne@68
|
823 if(verbose){System.err.println("ALIGN");}
|
jpayne@68
|
824 Read[] consensus=ProkObject.consensusReads(sc.type);
|
jpayne@68
|
825 if(consensus==null || consensus.length==0){return true;}
|
jpayne@68
|
826 boolean refined=false;
|
jpayne@68
|
827 // System.err.println("Initial: "+orf.start+", "+orf.stop);
|
jpayne@68
|
828 for(Read r : consensus){
|
jpayne@68
|
829 // refined=refineByAlignment(orf, bases, strand, sc, r.bases, 15, 15, 2);
|
jpayne@68
|
830 refined=refineByAlignment(orf, bases, strand, sc, r.bases, sc.startSlop(), sc.stopSlop(), 2);
|
jpayne@68
|
831 if(refined || sc.type==r18S || true){break;}
|
jpayne@68
|
832 }
|
jpayne@68
|
833 if(refined){
|
jpayne@68
|
834 if(verbose){System.err.println("Aligned to: "+orf.start+", "+orf.stop);}
|
jpayne@68
|
835 }else{
|
jpayne@68
|
836 if(verbose){System.err.println("Alignment failed.");}
|
jpayne@68
|
837 orf.orfScore=Tools.min(-999, orf.orfScore-9999);
|
jpayne@68
|
838 }
|
jpayne@68
|
839 return refined;
|
jpayne@68
|
840 }
|
jpayne@68
|
841
|
jpayne@68
|
842 boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc, byte[] consensus,
|
jpayne@68
|
843 final int startSlop, final int stopSlop, int recurLimit){
|
jpayne@68
|
844 final int start0=orf.start;
|
jpayne@68
|
845 final int stop0=orf.stop;
|
jpayne@68
|
846
|
jpayne@68
|
847 assert(start0>=0 && start0<bases.length) : start0+", "+stop0;
|
jpayne@68
|
848 assert(stop0>=0 && stop0<bases.length) : start0+", "+stop0;
|
jpayne@68
|
849
|
jpayne@68
|
850 final float minID=sc.minIdentity();
|
jpayne@68
|
851 final int padding=Tools.min(alignmentPadding, 30+sc.lengthAvg/4);
|
jpayne@68
|
852 final int a=Tools.max(0, orf.start-padding);
|
jpayne@68
|
853 final int b=Tools.min(bases.length-1, orf.stop+padding);
|
jpayne@68
|
854 final int reflen=b-a+1;
|
jpayne@68
|
855 if(reflen>10*sc.lengthAvg && reflen>20000){
|
jpayne@68
|
856 System.err.println("Skipped reflen "+reflen+"/"+sc.lengthAvg+" for "
|
jpayne@68
|
857 + "seqlen="+bases.length+", orf="+orf.toString());
|
jpayne@68
|
858 assert(false);
|
jpayne@68
|
859 //TODO: Possibly change return to -1, 0, 1 ("can't align")
|
jpayne@68
|
860 //Should be a limit on window size...
|
jpayne@68
|
861 //Also consider shrinking matrix after jumbo alignments
|
jpayne@68
|
862 return false;
|
jpayne@68
|
863 }
|
jpayne@68
|
864 assert(a>=0 && b<bases.length) : a+", "+b;
|
jpayne@68
|
865 SingleStateAlignerFlat2 ssa=getSSA();
|
jpayne@68
|
866 final int minScore=ssa.minScoreByIdentity(consensus.length, minID);
|
jpayne@68
|
867 int[] max=ssa.fillUnlimited(consensus, bases, a, b, minScore);
|
jpayne@68
|
868 if(max==null){return false;}
|
jpayne@68
|
869
|
jpayne@68
|
870 final int rows=max[0];
|
jpayne@68
|
871 final int maxCol=max[1];
|
jpayne@68
|
872 final int maxState=max[2];
|
jpayne@68
|
873 // final int maxScore=max[3];
|
jpayne@68
|
874 // final int maxStart=max[4];
|
jpayne@68
|
875
|
jpayne@68
|
876 //returns {score, bestRefStart, bestRefStop}
|
jpayne@68
|
877 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
|
jpayne@68
|
878 final int[] score=ssa.score(consensus, bases, a, b, rows, maxCol, maxState);
|
jpayne@68
|
879 final int rstart=Tools.max(score[1], 0);
|
jpayne@68
|
880 final int rstop=Tools.min(score[2], bases.length-1);
|
jpayne@68
|
881 final float id=ssa.tracebackIdentity(consensus, bases, a, b, rows, maxCol, maxState, null);
|
jpayne@68
|
882
|
jpayne@68
|
883 assert(rstart>=0 && rstart<bases.length) : "bases="+bases.length+
|
jpayne@68
|
884 ", rstart="+rstart+", rstop="+rstop+", a="+a+", b="+b+"\n"+"score="+Arrays.toString(score)+", id="+id;
|
jpayne@68
|
885 assert(rstop>=0 && rstop<bases.length) : rstart+", "+rstop;
|
jpayne@68
|
886
|
jpayne@68
|
887 if(score.length>3 && recurLimit>0){
|
jpayne@68
|
888 final int padLeft=score[3];
|
jpayne@68
|
889 final int padRight=score[4];
|
jpayne@68
|
890 //TODO: This takes extra memory. It may be better to cap the width or ignore start0/stop0 here.
|
jpayne@68
|
891 int rstart2=Tools.max(0, Tools.min(start0, rstart)-10);
|
jpayne@68
|
892 int rstop2=Tools.min(bases.length-1, Tools.max(stop0, rstop)+10);
|
jpayne@68
|
893 assert(rstart2>=0 && rstart2<bases.length) : rstart2+", "+rstop2;
|
jpayne@68
|
894 assert(rstop2>=0 && rstop2<bases.length) : rstart2+", "+rstop2;
|
jpayne@68
|
895 orf.start=rstart2;
|
jpayne@68
|
896 orf.stop=rstop2;
|
jpayne@68
|
897 if(orf.start<a || orf.stop>b){
|
jpayne@68
|
898 boolean ret=refineByAlignment(orf, bases, strand, sc, consensus, 0, 0, recurLimit-1);
|
jpayne@68
|
899 if(ret){return true;}
|
jpayne@68
|
900 }
|
jpayne@68
|
901 orf.start=start0;
|
jpayne@68
|
902 orf.stop=stop0;
|
jpayne@68
|
903 // assert(false) : "Don't traceback after recur.";
|
jpayne@68
|
904 }
|
jpayne@68
|
905 if(verbose){
|
jpayne@68
|
906 System.err.println("Identity: "+id);
|
jpayne@68
|
907 }
|
jpayne@68
|
908 // assert(score.length==3) : "TODO: Handle padding requests.";
|
jpayne@68
|
909
|
jpayne@68
|
910 // System.err.println("Identity: "+String.format("%.2f", 100*id)+"; location: "+rstart+"-"+rstop);
|
jpayne@68
|
911 if(id<minID){return false;}
|
jpayne@68
|
912
|
jpayne@68
|
913
|
jpayne@68
|
914 if(Tools.absdif(rstart, start0)>startSlop){orf.start=rstart;}
|
jpayne@68
|
915 if(Tools.absdif(rstop, stop0)>stopSlop){orf.stop=rstop;}
|
jpayne@68
|
916 return true;
|
jpayne@68
|
917 }
|
jpayne@68
|
918
|
jpayne@68
|
919 void fillPoints(final int left, final int right, final byte[] bases, final FrameStats fs, float thresh, final IntList points, final FloatList scores){
|
jpayne@68
|
920 points.clear();
|
jpayne@68
|
921 scores.clear();
|
jpayne@68
|
922 final float minThresh=thresh;//thresh*0.05f;
|
jpayne@68
|
923 // System.err.println(left+", "+right+", "+thresh+", "+minThresh);
|
jpayne@68
|
924 while(points.size()<8 && thresh>=minThresh){
|
jpayne@68
|
925 // System.err.println("Running with thresh="+thresh);
|
jpayne@68
|
926 points.clear();
|
jpayne@68
|
927 scores.clear();
|
jpayne@68
|
928 for(int i=left; i<right; i++){
|
jpayne@68
|
929 float score=fs.scorePoint(i, bases);
|
jpayne@68
|
930 // System.err.println(i+", "+score);
|
jpayne@68
|
931 if(score>=thresh){
|
jpayne@68
|
932 points.add(i);
|
jpayne@68
|
933 scores.add(score);
|
jpayne@68
|
934 }
|
jpayne@68
|
935 }
|
jpayne@68
|
936 thresh=thresh*0.75f;
|
jpayne@68
|
937 }
|
jpayne@68
|
938 }
|
jpayne@68
|
939
|
jpayne@68
|
940 /**
|
jpayne@68
|
941 * Generate all possible genes from each Orf, and return them in a new set of lists.
|
jpayne@68
|
942 * @param frameLists
|
jpayne@68
|
943 * @param bases
|
jpayne@68
|
944 * @return Lists of orfs.
|
jpayne@68
|
945 */
|
jpayne@68
|
946 private ArrayList<Orf>[] breakOrfs(ArrayList<Orf>[] frameLists, byte[] bases){
|
jpayne@68
|
947
|
jpayne@68
|
948 @SuppressWarnings("unchecked")
|
jpayne@68
|
949 ArrayList<Orf>[] brokenLists=new ArrayList[6];
|
jpayne@68
|
950 for(int strand=0; strand<2; strand++){
|
jpayne@68
|
951 for(int frame=0; frame<3; frame++){
|
jpayne@68
|
952 int fnum=frame+3*strand;
|
jpayne@68
|
953 ArrayList<Orf> longest=frameLists[fnum]; //Longest Orf per stop
|
jpayne@68
|
954 ArrayList<Orf> broken=new ArrayList<Orf>(); //All high scoring Orfs, including multiple per stop, with different starts
|
jpayne@68
|
955 if(longest!=null) {
|
jpayne@68
|
956 for(Orf orf : longest){
|
jpayne@68
|
957 assert(orf.frame==frame);
|
jpayne@68
|
958 assert(orf.strand==strand);
|
jpayne@68
|
959 ArrayList<Orf> temp=breakOrf(orf, bases);
|
jpayne@68
|
960 if(temp!=null){
|
jpayne@68
|
961 broken.addAll(temp);
|
jpayne@68
|
962 }
|
jpayne@68
|
963 }
|
jpayne@68
|
964 }
|
jpayne@68
|
965 Collections.sort(broken);
|
jpayne@68
|
966 brokenLists[fnum]=broken;
|
jpayne@68
|
967 }
|
jpayne@68
|
968 //Reverse-complement bases after processing each strand
|
jpayne@68
|
969 AminoAcid.reverseComplementBasesInPlace(bases);
|
jpayne@68
|
970 }
|
jpayne@68
|
971 return brokenLists;
|
jpayne@68
|
972 }
|
jpayne@68
|
973
|
jpayne@68
|
974 /**
|
jpayne@68
|
975 * Generate an Orf for each possible start codon.
|
jpayne@68
|
976 * Retain only the high-scoring ones.
|
jpayne@68
|
977 * @param longest Longest open reading frame for a given stop.
|
jpayne@68
|
978 * @param bases Bases, oriented for this Orf.
|
jpayne@68
|
979 * @return List of Orfs.
|
jpayne@68
|
980 */
|
jpayne@68
|
981 private ArrayList<Orf> breakOrf(Orf longest, byte[] bases){
|
jpayne@68
|
982 assert(longest.start<longest.stop);
|
jpayne@68
|
983 final int flipped=longest.flipped();
|
jpayne@68
|
984 if(flipped==1){longest.flip();}//Now the orf is aligned to its native strand
|
jpayne@68
|
985
|
jpayne@68
|
986 geneStopsMade++;
|
jpayne@68
|
987
|
jpayne@68
|
988 final FrameStats innerStats=pgm.statsCDS.inner;
|
jpayne@68
|
989 final FrameStats startStats=pgm.statsCDS.start;
|
jpayne@68
|
990 final FrameStats stopStats=pgm.statsCDS.stop;
|
jpayne@68
|
991
|
jpayne@68
|
992 final String name=longest.scafName;
|
jpayne@68
|
993 final int start=longest.start;
|
jpayne@68
|
994 final int stop=longest.stop;
|
jpayne@68
|
995 final int strand=longest.strand;
|
jpayne@68
|
996 final int max=Tools.min(longest.stop-2, longest.stop-minLen+4);
|
jpayne@68
|
997
|
jpayne@68
|
998 assert(pgm!=null) : pgm;
|
jpayne@68
|
999 assert(pgm.statsCDS!=null) : pgm;
|
jpayne@68
|
1000 assert(pgm.statsCDS.inner!=null) : pgm.statsCDS;
|
jpayne@68
|
1001 assert(pgm.statsCDS.inner.k>0) : pgm.statsCDS.inner;
|
jpayne@68
|
1002
|
jpayne@68
|
1003 final int k=innerStats.k;
|
jpayne@68
|
1004 final int mask=~((-1)<<(2*k));
|
jpayne@68
|
1005
|
jpayne@68
|
1006 final float stopScore=stopStats.scorePoint(longest.stop, bases);
|
jpayne@68
|
1007 stCds.geneStopScoreSum+=stopScore;
|
jpayne@68
|
1008 stCds.geneStopScoreCount++;
|
jpayne@68
|
1009
|
jpayne@68
|
1010 ArrayList<Orf> broken=new ArrayList<Orf>();
|
jpayne@68
|
1011 int created=0;
|
jpayne@68
|
1012
|
jpayne@68
|
1013 int codon=0;
|
jpayne@68
|
1014 int kmer=0;
|
jpayne@68
|
1015 int len=0;
|
jpayne@68
|
1016 int numKmers=0;
|
jpayne@68
|
1017 float currentScore=0;
|
jpayne@68
|
1018 for(int pos=start, currentFrame=0; pos<=stop; pos++){
|
jpayne@68
|
1019 final byte b=bases[pos];
|
jpayne@68
|
1020 final int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
1021 codon=((codon<<2)|x);
|
jpayne@68
|
1022 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
1023
|
jpayne@68
|
1024 if(x>=0){
|
jpayne@68
|
1025 len++;
|
jpayne@68
|
1026 if(len>=k){
|
jpayne@68
|
1027 float prob=innerStats.probs[currentFrame][kmer];
|
jpayne@68
|
1028 float dif=prob-0.99f;//Prob above 1 is more likely than average
|
jpayne@68
|
1029 currentScore+=dif;
|
jpayne@68
|
1030 // outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", dif)+",\tscore="+String.format(Locale.ROOT, "%.4f", currentScore)+
|
jpayne@68
|
1031 // "\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+
|
jpayne@68
|
1032 // "\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame));
|
jpayne@68
|
1033 }else{
|
jpayne@68
|
1034 // outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", 0.0)+",\tscore="+String.format(Locale.ROOT, "%.4f", 0.0)+
|
jpayne@68
|
1035 // "\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+
|
jpayne@68
|
1036 // "\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame));
|
jpayne@68
|
1037 }
|
jpayne@68
|
1038 }else{
|
jpayne@68
|
1039 len=0;
|
jpayne@68
|
1040 kmer=0;
|
jpayne@68
|
1041 }
|
jpayne@68
|
1042
|
jpayne@68
|
1043 currentFrame++;
|
jpayne@68
|
1044 // outstream.println("pos="+pos+", codon="+AminoAcid.kmerToString(kmer, 3)+", frame="+currentFrame+", start="+start+", isStartCodon="+pgm.isStartCodon(codon));
|
jpayne@68
|
1045 if(currentFrame>2){
|
jpayne@68
|
1046 currentFrame=0;
|
jpayne@68
|
1047 if(pos<max && created<breakLimit && (pos==start+2 || pgm.isStartCodon(codon))){
|
jpayne@68
|
1048 // outstream.println(x);
|
jpayne@68
|
1049 int glen=stop-pos+3;
|
jpayne@68
|
1050 assert(glen>=minLen) : "glen="+glen+", minLen="+minLen+", pos="+pos+", max="+max+", start="+start;
|
jpayne@68
|
1051
|
jpayne@68
|
1052 int oStart=pos-2;
|
jpayne@68
|
1053 float startScore=startStats.scorePoint(oStart, bases);
|
jpayne@68
|
1054
|
jpayne@68
|
1055 stCds.geneStartScoreSum+=startScore;
|
jpayne@68
|
1056 stCds.geneStartScoreCount++;
|
jpayne@68
|
1057
|
jpayne@68
|
1058 stCds.lengthSum+=(stop-(pos-2)+1);
|
jpayne@68
|
1059 stCds.lengthCount++;
|
jpayne@68
|
1060
|
jpayne@68
|
1061 if((startScore>=minStartScore || pos<6) /* && stopScore>=minStopScore /*|| broken.isEmpty()*/){
|
jpayne@68
|
1062 Orf orf=new Orf(name, pos-2, stop, strand, longest.frame, bases, false, longest.type);
|
jpayne@68
|
1063
|
jpayne@68
|
1064 geneStartsMade++;
|
jpayne@68
|
1065 orf.kmerScore=currentScore;
|
jpayne@68
|
1066 orf.startScore=startScore;
|
jpayne@68
|
1067 orf.stopScore=stopScore;
|
jpayne@68
|
1068
|
jpayne@68
|
1069 assert(orf.frame==longest.frame);
|
jpayne@68
|
1070 assert(orf.strand==strand);
|
jpayne@68
|
1071
|
jpayne@68
|
1072 if(strand==1){orf.flip();}
|
jpayne@68
|
1073 broken.add(orf);
|
jpayne@68
|
1074 created++;
|
jpayne@68
|
1075 }
|
jpayne@68
|
1076 }
|
jpayne@68
|
1077 codon=0;
|
jpayne@68
|
1078 }
|
jpayne@68
|
1079 }
|
jpayne@68
|
1080
|
jpayne@68
|
1081 final int size=broken.size();
|
jpayne@68
|
1082 final int sizeCutoff=Tools.max(5, size/2);
|
jpayne@68
|
1083 if(size<1){return broken;}
|
jpayne@68
|
1084 Orf best=broken.get(0);
|
jpayne@68
|
1085 Orf bestStart=broken.get(0);
|
jpayne@68
|
1086 for(int i=0; i<size; i++){
|
jpayne@68
|
1087 Orf orf=broken.get(i);
|
jpayne@68
|
1088
|
jpayne@68
|
1089 //This fixes scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2).
|
jpayne@68
|
1090 orf.kmerScore=currentScore-orf.kmerScore;
|
jpayne@68
|
1091 orf.orfScore=orf.calcOrfScore();
|
jpayne@68
|
1092 if(orf.orfScore>=best.orfScore){best=orf;}
|
jpayne@68
|
1093 if(orf.startScore>=bestStart.startScore){bestStart=orf;}
|
jpayne@68
|
1094
|
jpayne@68
|
1095 stCds.geneInnerScoreSum+=orf.averageKmerScore();
|
jpayne@68
|
1096 stCds.geneInnerScoreCount++;
|
jpayne@68
|
1097 }
|
jpayne@68
|
1098
|
jpayne@68
|
1099 //Sort to by score descending to eliminate low-scoring copies.
|
jpayne@68
|
1100 if(broken.size()>1){Collections.sort(broken, Feature.featureComparatorScore);}
|
jpayne@68
|
1101
|
jpayne@68
|
1102 int removed=0;
|
jpayne@68
|
1103 for(int i=0; i<size; i++){//Fix scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2).
|
jpayne@68
|
1104 Orf orf=broken.get(i);
|
jpayne@68
|
1105 if(orf.averageKmerScore()<minInnerScore || orf.orfScore<minOrfScore ||
|
jpayne@68
|
1106 orf.orfScore/orf.length()<minAvgScore ||
|
jpayne@68
|
1107 orf.orfScore<0.5f*best.orfScore-10 || (orf.startScore<bestStart.startScore-0.55f && orf.kmerScore<best.kmerScore*1.1f && orf!=best)){
|
jpayne@68
|
1108 broken.set(i, null);
|
jpayne@68
|
1109 removed++;
|
jpayne@68
|
1110 }else if(i>sizeCutoff){
|
jpayne@68
|
1111 broken.set(i, null);
|
jpayne@68
|
1112 removed++;
|
jpayne@68
|
1113 }
|
jpayne@68
|
1114 }
|
jpayne@68
|
1115 if(removed>0){
|
jpayne@68
|
1116 Tools.condenseStrict(broken);
|
jpayne@68
|
1117 }
|
jpayne@68
|
1118
|
jpayne@68
|
1119 geneStartsRetained+=broken.size();
|
jpayne@68
|
1120 geneStopsRetained+=(broken.size()>0 ? 1 : 0);
|
jpayne@68
|
1121
|
jpayne@68
|
1122 if(flipped==1){longest.flip();}
|
jpayne@68
|
1123 return broken;
|
jpayne@68
|
1124 }
|
jpayne@68
|
1125
|
jpayne@68
|
1126 /*--------------------------------------------------------------*/
|
jpayne@68
|
1127 /*---------------- Fields ----------------*/
|
jpayne@68
|
1128 /*--------------------------------------------------------------*/
|
jpayne@68
|
1129
|
jpayne@68
|
1130 /**
|
jpayne@68
|
1131 * Current gene model.
|
jpayne@68
|
1132 * TODO: Dynamically swap this as needed for contigs with varying GC.
|
jpayne@68
|
1133 */
|
jpayne@68
|
1134 GeneModel pgm;
|
jpayne@68
|
1135
|
jpayne@68
|
1136 //Gene-calling cutoffs
|
jpayne@68
|
1137 final int minLen;
|
jpayne@68
|
1138 final int maxOverlapSameStrand;
|
jpayne@68
|
1139 final int maxOverlapOppositeStrand;
|
jpayne@68
|
1140 final float minStartScore;
|
jpayne@68
|
1141 final float minStopScore;
|
jpayne@68
|
1142 final float minInnerScore;
|
jpayne@68
|
1143 final float minOrfScore;
|
jpayne@68
|
1144 final float minAvgScore;
|
jpayne@68
|
1145
|
jpayne@68
|
1146 // static float[] cutoff1=new float[] {0, 40f, 200f, 150f, 45f};
|
jpayne@68
|
1147 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 60f};
|
jpayne@68
|
1148 // static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.4f, 1.8f};
|
jpayne@68
|
1149 // static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 0.6f, 1.1f};
|
jpayne@68
|
1150 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.55f};//inner score
|
jpayne@68
|
1151 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f};
|
jpayne@68
|
1152 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f};
|
jpayne@68
|
1153
|
jpayne@68
|
1154 // static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 45f};
|
jpayne@68
|
1155 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f};
|
jpayne@68
|
1156 // static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.3f, 1.8f};
|
jpayne@68
|
1157 // static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.5f, 1.1f};
|
jpayne@68
|
1158 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score
|
jpayne@68
|
1159 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f};
|
jpayne@68
|
1160 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f};
|
jpayne@68
|
1161
|
jpayne@68
|
1162 //for k=4,2,2
|
jpayne@68
|
1163 // static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 40f};
|
jpayne@68
|
1164 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f};
|
jpayne@68
|
1165 // static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.1f, 1.8f};
|
jpayne@68
|
1166 // static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.45f, 1.1f};
|
jpayne@68
|
1167 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score
|
jpayne@68
|
1168 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f};
|
jpayne@68
|
1169 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f};
|
jpayne@68
|
1170
|
jpayne@68
|
1171 // //for k=5,3,3
|
jpayne@68
|
1172 // static float[] cutoff1=new float[] {0, 40f, 300f, 300f, 135f};//sum score
|
jpayne@68
|
1173 // static float[] cutoff2=new float[] {0, 44f, 300f, 400f, 100f};//orf score
|
jpayne@68
|
1174 // static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.5f, 3.5f};//start score
|
jpayne@68
|
1175 // static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 1.4f, 1.8f};//stop score
|
jpayne@68
|
1176 // static float[] cutoff5=new float[] {0, 1.0f, 1.1f, 1.15f, 1.4f};//inner score
|
jpayne@68
|
1177 // static float[] scoreMult=new float[] {1f, 8f, 80f, 120f, 5f};//score mult
|
jpayne@68
|
1178 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f};//bias for sum score
|
jpayne@68
|
1179
|
jpayne@68
|
1180 //for k=6,3,3 - this has low scores for correct 23s stops; may try k=2 or k=4 for that end
|
jpayne@68
|
1181 //Also, 5S seems to score low very for archaea
|
jpayne@68
|
1182 // public static int CDS=0, tRNA=1, /*r16S=2,*/ r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
|
jpayne@68
|
1183 // static float[] cutoff1=new float[] {0, 100f, 600f, 400f, 300f};//sum score
|
jpayne@68
|
1184 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f};//orf score
|
jpayne@68
|
1185 // static float[] cutoff3=new float[] {0, 3.0f, 1.8f, 1.6f, 4.0f};//start score
|
jpayne@68
|
1186 // static float[] cutoff4=new float[] {0, 2.8f, 2.0f, 0.90f, 1.9f};//stop score
|
jpayne@68
|
1187 // static float[] cutoff5=new float[] {0, 2.8f, 1.1f, 1.55f, 1.4f};//inner score
|
jpayne@68
|
1188 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f};//score mult
|
jpayne@68
|
1189 // static float[] biases=new float[] {1f, 1.25f, 1.30f, 1.30f, 1.22f};//16S bias for sum score: 1.25 best for archs, 1.4 best for bacts
|
jpayne@68
|
1190
|
jpayne@68
|
1191 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
|
jpayne@68
|
1192 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 270f, 300f};//sum score
|
jpayne@68
|
1193 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score
|
jpayne@68
|
1194 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 3.8f, 1.65f};//start score
|
jpayne@68
|
1195 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.8f, 1.70f};//stop score
|
jpayne@68
|
1196 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 1.4f, 1.70f};//inner score
|
jpayne@68
|
1197 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f, 35f};//score mult
|
jpayne@68
|
1198 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.30f, 1.30f};
|
jpayne@68
|
1199
|
jpayne@68
|
1200 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
|
jpayne@68
|
1201 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 90f, 300f};//sum score
|
jpayne@68
|
1202 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 24f, 300f};//orf score
|
jpayne@68
|
1203 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 2.0f, 1.65f};//start score
|
jpayne@68
|
1204 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score
|
jpayne@68
|
1205 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score
|
jpayne@68
|
1206 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult
|
jpayne@68
|
1207 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.50f, 1.30f};
|
jpayne@68
|
1208
|
jpayne@68
|
1209 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
|
jpayne@68
|
1210 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 100f, 300f};//sum score
|
jpayne@68
|
1211 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score
|
jpayne@68
|
1212 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 1.8f, 1.65f};//start score
|
jpayne@68
|
1213 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score
|
jpayne@68
|
1214 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score
|
jpayne@68
|
1215 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult
|
jpayne@68
|
1216 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.55f, 1.30f};
|
jpayne@68
|
1217
|
jpayne@68
|
1218 // {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
|
jpayne@68
|
1219 static float[] cutoff1=new float[] {0, 20f, 300f, 400f, 100f, 400f};//sum score
|
jpayne@68
|
1220 static float[] cutoff2=new float[] {0, 36f, 300f, 300f, 32f, 300f};//orf score //tRNA is very sensitive here
|
jpayne@68
|
1221 static float[] cutoff3=new float[] {0, 2.4f, 1.65f, 1.6f, 1.8f, 1.5f};//start score
|
jpayne@68
|
1222 static float[] cutoff4=new float[] {0, 1.5f, 1.70f, 0.90f, 1.0f, 1.1f};//stop score
|
jpayne@68
|
1223 static float[] cutoff5=new float[] {0, 2.2f, 1.70f, 1.55f, 2.6f, 1.5f};//inner score
|
jpayne@68
|
1224 static float[] scoreMult=new float[] {1f, 1.0f, 35f, 80f, 1.25f, 35f};//score mult
|
jpayne@68
|
1225 static float[] biases=new float[] {1f, 1.45f, 1.30f, 1.30f, 1.55f, 1.50f};
|
jpayne@68
|
1226
|
jpayne@68
|
1227 long geneStopsMade=0;
|
jpayne@68
|
1228 long geneStartsMade=0;
|
jpayne@68
|
1229 long geneStartsRetained=0;
|
jpayne@68
|
1230 long geneStopsRetained=0;
|
jpayne@68
|
1231 long geneStartsOut=0;
|
jpayne@68
|
1232
|
jpayne@68
|
1233 long tRNAOut=0;
|
jpayne@68
|
1234 long r16SOut=0;
|
jpayne@68
|
1235 long r23SOut=0;
|
jpayne@68
|
1236 long r5SOut=0;
|
jpayne@68
|
1237 long r18SOut=0;
|
jpayne@68
|
1238
|
jpayne@68
|
1239 ScoreTracker stCds=new ScoreTracker(CDS);
|
jpayne@68
|
1240 ScoreTracker stCds2=new ScoreTracker(CDS);
|
jpayne@68
|
1241 ScoreTracker stCdsPass=new ScoreTracker(CDS);
|
jpayne@68
|
1242 ScoreTracker sttRNA=new ScoreTracker(tRNA);
|
jpayne@68
|
1243 ScoreTracker st16s=new ScoreTracker(r16S);
|
jpayne@68
|
1244 ScoreTracker st23s=new ScoreTracker(r23S);
|
jpayne@68
|
1245 ScoreTracker st5s=new ScoreTracker(r5S);
|
jpayne@68
|
1246 ScoreTracker st18s=new ScoreTracker(r18S);
|
jpayne@68
|
1247
|
jpayne@68
|
1248 ScoreTracker[] trackers=new ScoreTracker[] {stCdsPass, sttRNA, st16s, st23s, st5s, st18s};
|
jpayne@68
|
1249
|
jpayne@68
|
1250 public static SingleStateAlignerFlat2 getSSA(){
|
jpayne@68
|
1251 SingleStateAlignerFlat2 ssa=localSSA.get();
|
jpayne@68
|
1252 if(ssa==null){
|
jpayne@68
|
1253 synchronized(localSSA){
|
jpayne@68
|
1254 ssa=localSSA.get();
|
jpayne@68
|
1255 if(ssa==null){
|
jpayne@68
|
1256 ssa=new SingleStateAlignerFlat2();
|
jpayne@68
|
1257 localSSA.set(ssa);
|
jpayne@68
|
1258 }
|
jpayne@68
|
1259 }
|
jpayne@68
|
1260 }
|
jpayne@68
|
1261 return ssa;
|
jpayne@68
|
1262 }
|
jpayne@68
|
1263
|
jpayne@68
|
1264 public static SingleStateAlignerFlat3 getSSA3(){
|
jpayne@68
|
1265 SingleStateAlignerFlat3 ssa=localSSA3.get();
|
jpayne@68
|
1266 if(ssa==null){
|
jpayne@68
|
1267 synchronized(localSSA3){
|
jpayne@68
|
1268 ssa=localSSA3.get();
|
jpayne@68
|
1269 if(ssa==null){
|
jpayne@68
|
1270 ssa=new SingleStateAlignerFlat3();
|
jpayne@68
|
1271 localSSA3.set(ssa);
|
jpayne@68
|
1272 }
|
jpayne@68
|
1273 }
|
jpayne@68
|
1274 }
|
jpayne@68
|
1275 return ssa;
|
jpayne@68
|
1276 }
|
jpayne@68
|
1277
|
jpayne@68
|
1278 public static SingleStateAlignerFlatFloat getSSAF(){
|
jpayne@68
|
1279 SingleStateAlignerFlatFloat ssa=localSSAF.get();
|
jpayne@68
|
1280 if(ssa==null){
|
jpayne@68
|
1281 synchronized(localSSAF){
|
jpayne@68
|
1282 ssa=localSSAF.get();
|
jpayne@68
|
1283 if(ssa==null){
|
jpayne@68
|
1284 ssa=new SingleStateAlignerFlatFloat();
|
jpayne@68
|
1285 localSSAF.set(ssa);
|
jpayne@68
|
1286 }
|
jpayne@68
|
1287 }
|
jpayne@68
|
1288 }
|
jpayne@68
|
1289 return ssa;
|
jpayne@68
|
1290 }
|
jpayne@68
|
1291
|
jpayne@68
|
1292 private static ThreadLocal<SingleStateAlignerFlat2> localSSA=new ThreadLocal<SingleStateAlignerFlat2>();
|
jpayne@68
|
1293 private static ThreadLocal<SingleStateAlignerFlat3> localSSA3=new ThreadLocal<SingleStateAlignerFlat3>();
|
jpayne@68
|
1294 private static ThreadLocal<SingleStateAlignerFlatFloat> localSSAF=new ThreadLocal<SingleStateAlignerFlatFloat>();
|
jpayne@68
|
1295 // public static int maxAlignmentEndpointDifference=15;
|
jpayne@68
|
1296 public static int alignmentPadding=300;
|
jpayne@68
|
1297
|
jpayne@68
|
1298 public static int breakLimit=12;
|
jpayne@68
|
1299 public static int lookbackPlus=70;
|
jpayne@68
|
1300 public static int lookbackMinus=25;
|
jpayne@68
|
1301
|
jpayne@68
|
1302 // pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4));
|
jpayne@68
|
1303
|
jpayne@68
|
1304 //Operon length modifiers for same strand
|
jpayne@68
|
1305 public static float p0=-30f;
|
jpayne@68
|
1306 public static float p1=-0.35f; //Sensitive - higher decreases FP and TP
|
jpayne@68
|
1307 public static float p2=4.0f;//Insensitive - higher positive decreases FP and TP
|
jpayne@68
|
1308 public static float p3=12f; //Higher decreases FP (substantially) and TP (slightly)
|
jpayne@68
|
1309 public static float p4=-10f; //Lower decreases FP (weakly) and TP (greatly)
|
jpayne@68
|
1310 public static float p5=2.0f; //Insensitive - lower increases FP and TP
|
jpayne@68
|
1311 public static float p6=2f; //Greater increases FP and TP
|
jpayne@68
|
1312
|
jpayne@68
|
1313 // pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5);
|
jpayne@68
|
1314
|
jpayne@68
|
1315 //Operon length modifiers for opposite strand
|
jpayne@68
|
1316 public static float q1=-36f;
|
jpayne@68
|
1317 public static float q2=-1.6f; //q2 and q4 must have opposite signs
|
jpayne@68
|
1318 public static float q3=-12f;
|
jpayne@68
|
1319 public static float q4=3.0f; //(Lower [even negative] decreases FP and TP)
|
jpayne@68
|
1320 public static float q5=-40f;
|
jpayne@68
|
1321
|
jpayne@68
|
1322 private static PrintStream outstream=System.err;
|
jpayne@68
|
1323 static boolean verbose;
|
jpayne@68
|
1324
|
jpayne@68
|
1325 }
|