Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneCaller.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package prok; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 import java.util.Collections; | |
7 | |
8 import aligner.SingleStateAlignerFlat2; | |
9 import aligner.SingleStateAlignerFlat3; | |
10 import aligner.SingleStateAlignerFlatFloat; | |
11 import dna.AminoAcid; | |
12 import shared.KillSwitch; | |
13 import shared.Tools; | |
14 import stream.Read; | |
15 import structures.FloatList; | |
16 import structures.IntList; | |
17 import structures.LongHashSet; | |
18 | |
19 | |
20 /** | |
21 * This class calls genes within a single thread. | |
22 * @author Brian Bushnell | |
23 * @date Sep 24, 2018 | |
24 * | |
25 */ | |
26 public class GeneCaller extends ProkObject { | |
27 | |
28 /*--------------------------------------------------------------*/ | |
29 /*---------------- Init ----------------*/ | |
30 /*--------------------------------------------------------------*/ | |
31 | |
32 GeneCaller(int minLen_, int maxOverlapSameStrand_, int maxOverlapOppositeStrand_, | |
33 float minStartScore_, float minStopScore_, float minInnerScore_, | |
34 float minOrfScore_, float minAvgScore_, GeneModel pgm_){ | |
35 minLen=minLen_; | |
36 maxOverlapSameStrand=maxOverlapSameStrand_; | |
37 maxOverlapOppositeStrand=maxOverlapOppositeStrand_; | |
38 pgm=pgm_; | |
39 | |
40 minStartScore=minStartScore_; | |
41 minStopScore=minStopScore_; | |
42 minInnerScore=minInnerScore_; | |
43 minOrfScore=minOrfScore_; | |
44 minAvgScore=minAvgScore_; | |
45 } | |
46 | |
47 /*--------------------------------------------------------------*/ | |
48 /*---------------- Outer Methods ----------------*/ | |
49 /*--------------------------------------------------------------*/ | |
50 | |
51 public ArrayList<Orf> callGenes(Read r){ | |
52 return callGenes(r, pgm); | |
53 } | |
54 | |
55 public ArrayList<Orf> callGenes(Read r, GeneModel pgm_){ | |
56 pgm=pgm_; | |
57 | |
58 final String name=r.id; | |
59 final byte[] bases=r.bases; | |
60 | |
61 //Lists of all longest orfs per frame | |
62 ArrayList<Orf>[] frameLists=makeOrfs(name, bases, minLen); | |
63 //Lists of all high-scoring orfs per frame, with potentially multiple orfs sharing stops. | |
64 ArrayList<Orf>[] brokenLists=breakOrfs(frameLists, bases); | |
65 | |
66 ArrayList<Orf>[] rnaLists=null; | |
67 final int rlen=r.length(); | |
68 if(calltRNA || (call16S && rlen>800) || (call23S && rlen>1500) || call5S || (call18S && rlen>1000)){ | |
69 rnaLists=makeRnas(name, bases); | |
70 | |
71 brokenLists[0].addAll(rnaLists[0]); | |
72 brokenLists[3].addAll(rnaLists[1]); | |
73 Collections.sort(brokenLists[0]); | |
74 Collections.sort(brokenLists[3]); | |
75 } | |
76 | |
77 boolean printAllOrfs=false; | |
78 boolean printRnas=false; | |
79 if(printAllOrfs){ | |
80 ArrayList<Orf> temp=new ArrayList<Orf>(); | |
81 for(ArrayList<Orf> broken : brokenLists){ | |
82 temp.addAll(broken); | |
83 } | |
84 Collections.sort(temp); | |
85 return temp; | |
86 } | |
87 | |
88 if(printRnas && rnaLists!=null){ | |
89 ArrayList<Orf> temp=new ArrayList<Orf>(); | |
90 for(ArrayList<Orf> list : rnaLists){ | |
91 temp.addAll(list); | |
92 } | |
93 Collections.sort(temp); | |
94 return temp; | |
95 } | |
96 | |
97 stCds2.add(brokenLists); | |
98 | |
99 //Find the optimal path through Orfs | |
100 ArrayList<Orf> path=findPath(brokenLists, bases); | |
101 // geneStartsOut+=path.size(); | |
102 | |
103 if(callCDS){stCdsPass.add(path);} | |
104 if(calltRNA){sttRNA.add(path);} | |
105 if(call16S){st16s.add(path);} | |
106 if(call23S){st23s.add(path);} | |
107 if(call5S){st5s.add(path);} | |
108 if(call18S){st18s.add(path);} | |
109 | |
110 return path; | |
111 } | |
112 | |
113 /*--------------------------------------------------------------*/ | |
114 /*---------------- Inner Methods ----------------*/ | |
115 /*--------------------------------------------------------------*/ | |
116 | |
117 /** | |
118 * Generates lists of all max-length non-overlapping Orfs per frame. | |
119 * There IS overlap between frames. | |
120 * All Orfs come out flipped to + orientation. | |
121 * */ | |
122 ArrayList<Orf>[] makeRnas(String name, byte[] bases){ | |
123 @SuppressWarnings("unchecked") | |
124 ArrayList<Orf>[] array=new ArrayList[2]; | |
125 array[0]=new ArrayList<Orf>(); | |
126 array[1]=new ArrayList<Orf>(); | |
127 final float[] scores=new float[bases.length]; | |
128 final int[] kmersSeen=(lsuKmers==null && ssuKmers==null && trnaKmers==null && r5SKmers==null) ? null : new int[bases.length]; | |
129 for(int strand=0; strand<2; strand++){ | |
130 for(StatsContainer sc : pgm.rnaContainers){ | |
131 if(ProkObject.callType(sc.type)){ | |
132 ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, (sc.kmerSet()==null ? null : kmersSeen), false, -1);//TODO: Make this loop through all RNA types | |
133 if(strand==1 && list!=null){ | |
134 for(Orf orf : list){ | |
135 assert(orf.strand==strand); | |
136 orf.flip(); | |
137 } | |
138 } | |
139 if(list!=null){array[strand].addAll(list);} | |
140 } | |
141 } | |
142 Collections.sort(array[strand]); | |
143 AminoAcid.reverseComplementBasesInPlace(bases); | |
144 } | |
145 return array; | |
146 } | |
147 | |
148 /** Designed for quickly calling a single SSU */ | |
149 public Orf makeRna(String name, byte[] bases, int type){ | |
150 final float[] scores=new float[bases.length];//TODO: Big and slow; make a FloatList? | |
151 StatsContainer sc=pgm.allContainers[type]; | |
152 final int[] kmersSeen=(sc.kmerSet()==null ? null : new int[bases.length]);//TODO: IntList? | |
153 | |
154 int strand=0; | |
155 ArrayList<Orf> list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1); | |
156 final Orf best1=pickBest(list); | |
157 assert(best1==null || best1.start>=0 && best1.stop<bases.length) : bases.length+"\n"+best1; | |
158 if(best1!=null && best1.orfScore>-999){return best1;} | |
159 | |
160 strand++; | |
161 AminoAcid.reverseComplementBasesInPlace(bases); | |
162 list=makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, true, -1); | |
163 AminoAcid.reverseComplementBasesInPlace(bases); | |
164 if(strand==1 && list!=null){ | |
165 for(Orf orf : list){ | |
166 assert(orf.strand==strand); | |
167 orf.flip(); | |
168 } | |
169 } | |
170 final Orf best2=pickBest(list); | |
171 assert(best2==null || best2.start>=0 && best2.stop<bases.length) : bases.length+"\n"+best2; | |
172 if(best2!=null && best2.orfScore>-999){return best2;} | |
173 return best1!=null ? best1 : best2; | |
174 } | |
175 | |
176 final Orf pickBest(ArrayList<Orf> list){ | |
177 if(list==null){return null;} | |
178 Orf best=null; | |
179 for(Orf orf : list){ | |
180 if(best==null || orf.orfScore>best.orfScore){ | |
181 best=orf; | |
182 } | |
183 } | |
184 return best; | |
185 } | |
186 | |
187 /** | |
188 * Generates lists of all max-length non-overlapping Orfs per frame. | |
189 * There IS overlap between frames. | |
190 * All Orfs come out flipped to + orientation. | |
191 * */ | |
192 static ArrayList<Orf>[] makeOrfs(String name, byte[] bases, int minlen){ | |
193 @SuppressWarnings("unchecked") | |
194 ArrayList<Orf>[] array=new ArrayList[6]; | |
195 for(int strand=0; strand<2; strand++){ | |
196 for(int frame=0; frame<3; frame++){ | |
197 ArrayList<Orf> list=makeOrfsForFrame(name, bases, frame, strand, minlen); | |
198 array[frame+3*strand]=list; | |
199 if(strand==1 && list!=null){ | |
200 for(Orf orf : list){ | |
201 assert(orf.frame==frame); | |
202 assert(orf.strand==strand); | |
203 orf.flip(); | |
204 } | |
205 } | |
206 } | |
207 AminoAcid.reverseComplementBasesInPlace(bases); | |
208 } | |
209 return array; | |
210 } | |
211 | |
212 /** | |
213 * Dynamic programming phase. | |
214 * @param frameLists | |
215 * @param bases | |
216 * @return | |
217 */ | |
218 private ArrayList<Orf> findPath(ArrayList<Orf>[] frameLists, byte[] bases){ | |
219 ArrayList<Orf> all=new ArrayList<Orf>(); | |
220 for(ArrayList<Orf> list : frameLists){all.addAll(list);} | |
221 if(all.isEmpty()){return all;} | |
222 Collections.sort(all); | |
223 | |
224 for(Orf orf : all){ | |
225 orf.pathScorePlus=-999999; | |
226 orf.pathScoreMinus=-999999; | |
227 } | |
228 | |
229 int[] lastPositionScored=KillSwitch.allocInt1D(6); | |
230 Arrays.fill(lastPositionScored, -1); | |
231 | |
232 //Index of highest-scoring ORF in this frame, with prev on the plus strand | |
233 int[] bestIndexPlus=KillSwitch.allocInt1D(6); | |
234 //Index of highest-scoring ORF in this frame, with prev on the minus strand | |
235 int[] bestIndexMinus=KillSwitch.allocInt1D(6); | |
236 //Highest-scoring ORF in this frame, with prev on the plus strand | |
237 Orf[] bestOrfPlus=new Orf[6]; | |
238 //Highest-scoring ORF in this frame, with prev on the minus strand | |
239 Orf[] bestOrfMinus=new Orf[6]; | |
240 | |
241 int[][] bestIndex=new int[][] {bestIndexPlus, bestIndexMinus}; | |
242 Orf[][] bestOrf=new Orf[][] {bestOrfPlus, bestOrfMinus}; | |
243 | |
244 for(Orf orf : all){ | |
245 final int myListNum=3*orf.strand+orf.frame; | |
246 calcPathScore(orf, frameLists, lastPositionScored, bestIndex); | |
247 if(bestOrfPlus[myListNum]==null || orf.pathScorePlus>=bestOrfPlus[myListNum].pathScorePlus){ | |
248 bestOrfPlus[myListNum]=orf; | |
249 bestIndexPlus[myListNum]=lastPositionScored[myListNum]; | |
250 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf); | |
251 } | |
252 if(bestOrfMinus[myListNum]==null || orf.pathScoreMinus>=bestOrfMinus[myListNum].pathScoreMinus){ | |
253 bestOrfMinus[myListNum]=orf; | |
254 bestIndexMinus[myListNum]=lastPositionScored[myListNum]; | |
255 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf); | |
256 } | |
257 } | |
258 | |
259 Orf best=bestOrf[0][0]; | |
260 for(Orf[] array : bestOrf){ | |
261 for(Orf orf : array){ | |
262 if(best==null || (orf!=null && orf.pathScore()>best.pathScore())){ | |
263 best=orf; | |
264 } | |
265 } | |
266 } | |
267 ArrayList<Orf> bestPath=new ArrayList<Orf>(); | |
268 for(Orf orf=best; orf!=null; orf=orf.prev()){ | |
269 bestPath.add(orf); | |
270 if(orf.type==CDS){geneStartsOut++;} | |
271 else if(orf.type==tRNA){tRNAOut++;} | |
272 else if(orf.type==r16S){r16SOut++;} | |
273 else if(orf.type==r23S){r23SOut++;} | |
274 else if(orf.type==r5S){r5SOut++;} | |
275 else if(orf.type==r18S){r18SOut++;} | |
276 } | |
277 Collections.sort(bestPath); | |
278 return bestPath; | |
279 } | |
280 | |
281 /** | |
282 * Calculate the best path to this ORF. | |
283 * @param orf | |
284 * @param frameLists | |
285 * @param lastPositionScored | |
286 * @param bestIndex | |
287 */ | |
288 private void calcPathScore(Orf orf, ArrayList<Orf>[] frameLists, int[] lastPositionScored, int[][] bestIndex){ | |
289 final int myListNum=3*orf.strand+orf.frame; | |
290 | |
291 // System.err.println("* "+orf); | |
292 // System.err.println("* "+Arrays.toString(lastPositionScored)); | |
293 // System.err.println(); | |
294 | |
295 for(int listStrand=0; listStrand<2; listStrand++){ | |
296 for(int listFrame=0; listFrame<3; listFrame++){ | |
297 int listNum=listFrame+3*listStrand; | |
298 ArrayList<Orf> list=frameLists[listNum]; | |
299 int lastPos=lastPositionScored[listNum]; | |
300 int bestPos=bestIndex[listStrand][listNum]; | |
301 if(listStrand==0){ | |
302 calcPathScorePlus(orf, list, listStrand, lastPos, bestPos); | |
303 }else{ | |
304 calcPathScoreMinus(orf, list, listStrand, lastPos, bestPos); | |
305 } | |
306 } | |
307 } | |
308 | |
309 // System.err.println(myListNum+", "+Arrays.toString(lastPositionScored)+", "+frameLists[myListNum].size()); | |
310 | |
311 lastPositionScored[myListNum]++; | |
312 assert(frameLists[myListNum].get(lastPositionScored[myListNum])==orf) : myListNum+"\n"+orf+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum])+"\n" | |
313 +Arrays.toString(lastPositionScored)+"\n"+frameLists[myListNum].get(lastPositionScored[myListNum]+1); | |
314 | |
315 //These are sanity checks to make sure that the path did not break in the middle. | |
316 //Safe to disable. | |
317 // assert(orf.prevPlus!=null || orf.stop<100000); | |
318 // assert(orf.prevMinus!=null || orf.stop<100000); | |
319 // assert(orf.pathScore>-10) : orf.pathScore+"\n"+orf+"\n"+orf.prev+"\n"; | |
320 } | |
321 | |
322 /** | |
323 * Calculate the best path to this ORF from a plus-strand previous ORF. | |
324 * @param orf | |
325 * @param list | |
326 * @param listStrand | |
327 * @param lastPos | |
328 * @param bestPos | |
329 */ | |
330 private void calcPathScorePlus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){ | |
331 assert(listStrand==0); | |
332 if(lastPos<0){ | |
333 if(orf.prevPlus==null){ | |
334 orf.pathScorePlus=orf.orfScore; | |
335 orf.pathLengthPlus=1; | |
336 } | |
337 return; | |
338 } | |
339 if(list.isEmpty()){return;} | |
340 | |
341 // System.err.println("\nExamining \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame); | |
342 boolean found=false; | |
343 final boolean sameStrand=(orf.strand==listStrand); | |
344 final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand); | |
345 for(int i=lastPos, min=Tools.max(0, bestPos-lookbackPlus); i>=min || (i>0 && !found); i--){ | |
346 Orf prev=list.get(i); | |
347 assert(prev!=orf) : prev; | |
348 // System.err.println("Comparing to \t"+prev); | |
349 if(orf.isValidPrev(prev, maxOverlap)){ | |
350 int overlap=Tools.max(0, prev.stop-orf.start+1); | |
351 float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap); | |
352 | |
353 final float prevScore=prev.pathScore(); | |
354 final int prevLength=prev.pathLength(); | |
355 | |
356 float pathScore; | |
357 final int pathLength; | |
358 if(sameStrand){ | |
359 pathLength=prevLength+1; | |
360 pathScore=prevScore+orfScore; | |
361 pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4)); | |
362 }else{ | |
363 pathLength=1; | |
364 pathScore=prev.pathScore()+orfScore; | |
365 pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5); | |
366 } | |
367 | |
368 if(overlap<1 && prevScore>0){found=true;} | |
369 if(pathScore>=orf.pathScorePlus){ | |
370 orf.pathScorePlus=pathScore; | |
371 orf.prevPlus=prev; | |
372 orf.pathLengthPlus=pathLength; | |
373 // System.err.println("Set as best"); | |
374 } | |
375 } | |
376 if(found && prev.stop<maxOverlap-2000 && orf.prevPlus!=null){ | |
377 System.err.println("Breaking"); | |
378 break; | |
379 } | |
380 } | |
381 } | |
382 | |
383 /** | |
384 * Calculate the best path to this ORF from a minus-strand previous ORF. | |
385 * @param orf | |
386 * @param list | |
387 * @param listStrand | |
388 * @param lastPos | |
389 * @param bestPos | |
390 */ | |
391 private void calcPathScoreMinus(final Orf orf, final ArrayList<Orf> list, final int listStrand, final int lastPos, final int bestPos){ | |
392 assert(listStrand==1); | |
393 if(lastPos<0){ | |
394 if(orf.prevMinus==null){ | |
395 orf.pathScoreMinus=orf.orfScore; | |
396 orf.pathLengthMinus=1; | |
397 } | |
398 return; | |
399 } | |
400 if(list.isEmpty()){return;} | |
401 | |
402 // System.err.println("\nExamining \t"+orf+"\nlastPos="+lastPos+", bestPos="+bestPos+", sameFrame="+sameFrame); | |
403 boolean found=false; | |
404 final boolean sameStrand=(orf.strand==listStrand); | |
405 final int maxOverlap=(sameStrand ? maxOverlapSameStrand : maxOverlapOppositeStrand); | |
406 for(int i=lastPos, min=Tools.max(0, bestPos-lookbackMinus); i>=min || (i>0 && !found); i--){ | |
407 Orf prev=list.get(i); | |
408 assert(prev!=orf) : prev; | |
409 // System.err.println("Comparing to \t"+prev); | |
410 if(orf.isValidPrev(prev, maxOverlap)){ | |
411 int overlap=Tools.max(0, prev.stop-orf.start+1); | |
412 float orfScore=overlap==0 ? orf.orfScore : orf.calcOrfScore(overlap); | |
413 | |
414 final float prevScore=prev.pathScore(); | |
415 final int prevLength=prev.pathLength(); | |
416 | |
417 float pathScore; | |
418 final int pathLength; | |
419 if(sameStrand){ | |
420 pathLength=prevLength+1; | |
421 pathScore=prevScore+orfScore; | |
422 pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4)); | |
423 }else{ | |
424 pathLength=1; | |
425 pathScore=prev.pathScore()+orfScore; | |
426 pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5); | |
427 } | |
428 if(overlap<1 && prevScore>0){found=true;} | |
429 if(pathScore>=orf.pathScoreMinus){ | |
430 orf.pathScoreMinus=pathScore; | |
431 orf.prevMinus=prev; | |
432 orf.pathLengthMinus=pathLength; | |
433 // System.err.println("Set as best"); | |
434 } | |
435 } | |
436 if(found && prev.stop<maxOverlap-2000 && orf.prevMinus!=null){ | |
437 System.err.println("Breaking"); | |
438 break; | |
439 } | |
440 } | |
441 } | |
442 | |
443 /** | |
444 * Generates a list of maximal-length Orfs only (non-overlapping). | |
445 * All Orfs come out in native orientation (unflipped). | |
446 * */ | |
447 static ArrayList<Orf> makeOrfsForFrame(String name, byte[] bases, int startFrame, int strand, int minlen){ | |
448 // assert(false) : "TODO"; | |
449 assert(minlen>=3); | |
450 if(bases==null || bases.length<minlen){return null;} | |
451 ArrayList<Orf> orfs=new ArrayList<Orf>(); | |
452 if(!ProkObject.callCDS){return orfs;} | |
453 // int mask=63; | |
454 int code=0; | |
455 int start=-2; | |
456 int frame=0; | |
457 int pos=startFrame; | |
458 | |
459 | |
460 for(; pos<bases.length; pos++){ | |
461 byte b=bases[pos]; | |
462 int x=AminoAcid.baseToNumber[b]; | |
463 // code=((code<<2)|x)&mask; | |
464 code=((code<<2)|x); | |
465 frame++; | |
466 if(frame==3){ | |
467 frame=0; | |
468 if(start>=0){ | |
469 if(GeneModel.isStopCodon(code) || code<0){//NOTE: This adds a stop codon wherever there are Ns. | |
470 int len=pos-start+1; | |
471 if(len>=minlen){ | |
472 Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS); | |
473 orfs.add(f); | |
474 } | |
475 start=-1; | |
476 } | |
477 }else{ | |
478 if(start==-2 || (start<0 && GeneModel.isStartCodon(code))){ | |
479 start=pos-2; | |
480 } | |
481 } | |
482 code=0; | |
483 } | |
484 } | |
485 | |
486 //Add a stop codon at the sequence end. | |
487 if(start>=0){ | |
488 pos--; | |
489 while(frame!=3 && frame!=-1){ | |
490 pos--; | |
491 frame--; | |
492 } | |
493 int len=pos-start+1; | |
494 if(len>=minlen){ | |
495 assert(pos<bases.length) : start+", "+pos+", "+bases.length; | |
496 Orf f=new Orf(name, start, pos, strand, startFrame, bases, true, CDS); | |
497 orfs.add(f); | |
498 } | |
499 } | |
500 | |
501 return orfs; | |
502 } | |
503 | |
504 /** | |
505 * Generates a list of maximal-length RNAs (non-overlapping). | |
506 * All RNAs come out in native orientation (unflipped). | |
507 * */ | |
508 ArrayList<Orf> makeRnasForStrand(String name, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen, boolean quitEarly, float bias){ | |
509 final int window=sc.lengthAvg; | |
510 if(bases==null || bases.length*2<window){return null;} | |
511 ArrayList<Orf> orfs=new ArrayList<Orf>(sc.type==tRNA ? 32 : 8); | |
512 | |
513 final FrameStats inner=sc.inner; | |
514 // final FrameStats start=sc.start; | |
515 // final FrameStats stop=sc.stop; | |
516 | |
517 final int k=inner.k; | |
518 final int mask=inner.mask; | |
519 // final float invLen=sc.invLengthAvg; | |
520 final int halfWindow=window/2; | |
521 final int maxWindow=(int)(window*1.5f); | |
522 final int maxWindow2=(int)(window*2.5f); | |
523 // final int slop=Tools.max(50, window/8); | |
524 int len=0; | |
525 int kmer=0; | |
526 float currentScore=0; | |
527 float currentScoreAbs=0; | |
528 bias=(bias>-1 ? bias : biases[sc.type]); | |
529 final float maxBias=biases[sc.type]*1.45f; | |
530 | |
531 float thresh=cutoff1[sc.type]; | |
532 float prevScore=0; | |
533 int lastStart=0; | |
534 | |
535 float max=0; | |
536 int maxPos=0; | |
537 | |
538 for(int pos=0; pos<bases.length; pos++){ | |
539 final byte b=bases[pos]; | |
540 assert(b>=0 && b<128) : "Invalid base b="+((int)b)+"; pos="+pos+"\n"+new String(bases)+"\n"; | |
541 final int x=AminoAcid.baseToNumber[b]; | |
542 kmer=((kmer<<2)|x)&mask; | |
543 | |
544 if(x>=0){ | |
545 len++; | |
546 if(len>=k){ | |
547 float prob=inner.probs[0][kmer]; | |
548 float dif=prob-bias;//Prob above 1 is more likely than average | |
549 currentScoreAbs+=prob; | |
550 currentScore=Tools.max(0, currentScore+dif); | |
551 } | |
552 | |
553 if(currentScore>0){ | |
554 if(currentScore>max){ | |
555 max=currentScore; | |
556 maxPos=pos; | |
557 } | |
558 if(prevScore<=0){ | |
559 lastStart=pos; | |
560 } | |
561 }else{ | |
562 int rnaLen=maxPos-lastStart; | |
563 if(max>thresh && rnaLen>=halfWindow){ | |
564 if(rnaLen>maxWindow){ | |
565 if(bias<=maxBias){ | |
566 orfs=null; | |
567 float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f); | |
568 return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult); | |
569 } | |
570 } | |
571 if(rnaLen<=maxWindow2){ | |
572 Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type); | |
573 orfs.add(orf); | |
574 orf.orfScore=max; | |
575 if(verbose){ | |
576 final int start2=(strand==0 ? lastStart : bases.length-maxPos-1); | |
577 final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1); | |
578 System.err.println("Made Orf "+start2+"\t"+stop2+"\t"+max); | |
579 } | |
580 } | |
581 } | |
582 max=0; | |
583 lastStart=pos; | |
584 } | |
585 // System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart); | |
586 prevScore=currentScore; | |
587 | |
588 // if(pos>=223000 && pos<232000){ | |
589 //// System.err.println("i="+i+"\tscore="+score+"\tmax="+max+"\tmaxPos="+maxPos+"\tprevScore="+prevScore+"\tlastStart="+lastStart); | |
590 // System.out.println(pos+"\t"+currentScore); | |
591 // } | |
592 | |
593 }else{ | |
594 len=0; | |
595 kmer=0; | |
596 } | |
597 scores[pos]=currentScoreAbs; | |
598 } | |
599 | |
600 // System.err.println("size="+orfs.size()+", type="+Orf.typeStrings[sc.type]); | |
601 | |
602 | |
603 { | |
604 int rnaLen=maxPos-lastStart; | |
605 if(max>thresh && rnaLen>=halfWindow){ | |
606 if(rnaLen>maxWindow){ | |
607 if(bias<=maxBias){ | |
608 orfs=null; | |
609 float biasMult=(rnaLen>8*window ? 1.2f : rnaLen>4*window ? 1.1f : 1.05f); | |
610 return makeRnasForStrand(name, bases, strand, sc, scores, kmersSeen, quitEarly, bias*biasMult); | |
611 } | |
612 } | |
613 if(rnaLen<=maxWindow2){ | |
614 Orf orf=new Orf(name, lastStart, maxPos, strand, 0, bases, false, sc.type); | |
615 orfs.add(orf); | |
616 orf.orfScore=max; | |
617 if(verbose){ | |
618 final int start2=(strand==0 ? lastStart : bases.length-maxPos-1); | |
619 final int stop2=(strand==0 ? maxPos : bases.length-lastStart-1); | |
620 System.err.println(start2+"\t"+stop2+"\t"+max); | |
621 } | |
622 } | |
623 } | |
624 } | |
625 | |
626 if(kmersSeen!=null && orfs.size()>0 && sc.kmerSet()!=null){ | |
627 fillKmersSeen(bases, kmersSeen, sc.kmerSet(), sc.kLongLen()); | |
628 } | |
629 | |
630 float cutoff=cutoff2[sc.type]; | |
631 | |
632 for(int i=0; i<orfs.size(); i++){ | |
633 Orf orf=orfs.get(i); | |
634 // System.err.println(orf.orfScore); | |
635 boolean good=refineRna(orf, bases, strand, sc, scores, kmersSeen); | |
636 if(orf.orfScore<cutoff || !good){ | |
637 if(verbose){System.err.println("REJECT: "+orf.toStringFlipped());} | |
638 orfs.set(i, null); | |
639 }else{ | |
640 if(verbose){System.err.println("ACCEPT: "+orf.toStringFlipped());} | |
641 if(quitEarly){ | |
642 orfs.clear(); | |
643 orfs.add(orf); | |
644 return orfs; | |
645 } | |
646 } | |
647 } | |
648 Tools.condenseStrict(orfs); | |
649 | |
650 // assert(false); | |
651 | |
652 // for(int pos=0; pos<bases.length; pos++){ | |
653 // final byte b=bases[pos]; | |
654 // final int x=AminoAcid.baseToNumber[b]; | |
655 // kmer=((kmer<<2)|x)&mask; | |
656 // | |
657 // if(x>=0){ | |
658 // len++; | |
659 // if(len>=k){ | |
660 // float prob=inner.probs[0][kmer]; | |
661 // float dif=prob-1.2f;//Prob above 1 is more likely than average | |
662 // currentScore=Tools.max(0, currentScore+dif); | |
663 // if(currentScore>0){ | |
664 // currentStreak++; | |
665 // }else{ | |
666 // currentStreak=0; | |
667 // } | |
668 // if(currentScore>200 && currentStreak>1500 && currentStreak<1700){ | |
669 // Orf orf=new Orf(name, pos-currentStreak-1, pos, strand, 0, bases, false); | |
670 // orfs.add(orf); | |
671 // orf.orfScore=currentScore; | |
672 // orf.startScore=start.scorePoint(orf.start, bases); | |
673 // orf.stopScore=stop.scorePoint(orf.stop, bases); | |
674 // currentStreak=0; | |
675 // currentScore=0; | |
676 // } | |
677 // } | |
678 // }else{ | |
679 // len=0; | |
680 // kmer=0; | |
681 // } | |
682 // } | |
683 | |
684 return orfs; | |
685 } | |
686 | |
687 void fillKmersSeen(byte[] bases, int[] kmersSeen, LongHashSet set, int k){ | |
688 final long mask=~((-1L)<<(2*k)); | |
689 long kmer=0; | |
690 int len=0; | |
691 int seen=0; | |
692 for(int i=0; i<bases.length; i++){ | |
693 final byte b=bases[i]; | |
694 final int num=AminoAcid.baseToNumber[b]; | |
695 if(num>=0){ | |
696 len++; | |
697 kmer=((kmer<<2)|num)&mask; | |
698 if(len>=k && set.contains(kmer)){seen++;} | |
699 }else{ | |
700 len=0; | |
701 } | |
702 kmersSeen[i]=seen; | |
703 } | |
704 } | |
705 | |
706 boolean refineRna(Orf orf, byte[] bases, int strand, StatsContainer sc, float[] scores, int[] kmersSeen){ | |
707 if(orf==null){return false;} | |
708 if(verbose){System.err.println("REFINE: "+orf.toStringFlipped());} | |
709 final int window=sc.lengthAvg; | |
710 // final int halfWindow=window/2; | |
711 // final int maxWindow=(int)(window*1.5f); | |
712 final int slop=Tools.max(60, 10+window/8); | |
713 final float invWindow=sc.invLengthAvg; | |
714 final float oldScore=orf.orfScore; | |
715 IntList starts=new IntList(); | |
716 IntList stops=new IntList(); | |
717 FloatList startScores=new FloatList(); | |
718 FloatList stopScores=new FloatList(); | |
719 | |
720 final int leftmost=Tools.max(0, orf.start-slop); | |
721 final int rightmost=Tools.min(bases.length-1, orf.stop+slop); | |
722 if(kmersSeen!=null){ | |
723 if(kmersSeen[leftmost]>=kmersSeen[rightmost]){ | |
724 // System.err.println("Bad: "+oldScore); | |
725 orf.orfScore=-999; | |
726 return false; | |
727 }else{ | |
728 // System.err.println("Good: "+oldScore); | |
729 } | |
730 } | |
731 if(verbose){System.err.println("REFINE2");} | |
732 | |
733 {//starts | |
734 final int left=leftmost; | |
735 final int right=Tools.min(bases.length-1, orf.stop+slop-window); | |
736 final float thresh=cutoff3[sc.type]; | |
737 fillPoints(left, right, bases, sc.start, thresh, starts, startScores); | |
738 } | |
739 if(verbose){System.err.println("starts: "+starts.size);} | |
740 // if((orf.start+"").startsWith("146") || true){System.err.println(starts);} | |
741 if(verbose){System.err.println(startScores);} | |
742 | |
743 {//stops | |
744 final int left=Tools.max(0, orf.start-slop+window); | |
745 final int right=rightmost; | |
746 final float thresh=cutoff4[sc.type]; | |
747 fillPoints(left, right, bases, sc.stop, thresh, stops, stopScores); | |
748 } | |
749 if(verbose){System.err.println("stops: "+stops.size);} | |
750 // if((orf.start+"").startsWith("146") || true){System.err.println(stops);} | |
751 if(verbose){System.err.println(stopScores);} | |
752 | |
753 final float innerThresh=cutoff5[sc.type]; | |
754 | |
755 final int minlen=Tools.max(window/2, window-slop); | |
756 final int maxlen=window+slop; | |
757 | |
758 orf.orfScore=0; | |
759 int lastStop=0; | |
760 for(int i=0; i<starts.size; i++){ | |
761 final int start=starts.get(i); | |
762 final int startSeen=kmersSeen==null ? 0 : kmersSeen[start]; | |
763 final float startScore=startScores.get(i); | |
764 // System.err.println("start="+start); | |
765 for(int j=lastStop; j<stops.size; j++){ | |
766 final int stop=stops.get(j); | |
767 final int rnalen=stop-start+1; | |
768 // System.err.println("stop="+stop); | |
769 if(rnalen<minlen){ | |
770 lastStop=j+1; | |
771 }else if(rnalen>maxlen){ | |
772 // System.err.println("broke"); | |
773 break; | |
774 }else if(kmersSeen!=null && kmersSeen[stop]<=startSeen){//TODO: Test this | |
775 // //skip | |
776 }else{ | |
777 final int len=stop-start+1; | |
778 final float stopScore=stopScores.get(j); | |
779 final float innerScore=(scores[stop]-scores[start])/len; | |
780 // System.err.println("innerScore="+innerScore); | |
781 assert(rnalen<=maxlen) : "start="+start+", stop="+stop+", rnalen="+rnalen+", minlen="+minlen+", maxlen="+maxlen; | |
782 if(innerScore>=innerThresh){ | |
783 final float a=Tools.max(startScore+0.25f, 0.25f); | |
784 final float b=Tools.max(stopScore+0.25f, 0.25f); | |
785 final float c=innerScore*innerScore; | |
786 final float d=(window-(2.4f*Tools.absdif(len, window)))*invWindow; | |
787 final float score=c*d*(float)Math.sqrt(a*b); | |
788 if(verbose && score>=0.2f*orf.orfScore){ | |
789 final int start2=(strand==0 ? start : bases.length-stop-1); | |
790 final int stop2=(strand==0 ? stop : bases.length-start-1); | |
791 System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore); | |
792 } | |
793 if(score>orf.orfScore){ | |
794 orf.start=start; | |
795 orf.stop=stop; | |
796 orf.kmerScore=innerScore*len; | |
797 // if(verbose){ | |
798 // final int start2=(strand==0 ? start : bases.length-stop-1); | |
799 // final int stop2=(strand==0 ? stop : bases.length-start-1); | |
800 // System.err.println(start2+"-"+stop2+", "+startScore+", "+stopScore+", "+innerScore+", "+(score*scoreMult[sc.type])+", "+oldScore); | |
801 // } | |
802 orf.startScore=startScore; | |
803 orf.stopScore=stopScore; | |
804 orf.orfScore=score; | |
805 } | |
806 }else{ | |
807 assert(true); | |
808 } | |
809 } | |
810 } | |
811 } | |
812 orf.orfScore*=scoreMult[sc.type]; | |
813 | |
814 if(starts.isEmpty() || stops.isEmpty()){ | |
815 if(verbose){System.err.println("No starts or stops.");} | |
816 orf.orfScore=Tools.min(-999, orf.orfScore-9999); | |
817 return false; | |
818 } | |
819 return refineByAlignment(orf, bases, strand, sc);//Returns true if no consensus is present | |
820 } | |
821 | |
822 boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc){ | |
823 if(verbose){System.err.println("ALIGN");} | |
824 Read[] consensus=ProkObject.consensusReads(sc.type); | |
825 if(consensus==null || consensus.length==0){return true;} | |
826 boolean refined=false; | |
827 // System.err.println("Initial: "+orf.start+", "+orf.stop); | |
828 for(Read r : consensus){ | |
829 // refined=refineByAlignment(orf, bases, strand, sc, r.bases, 15, 15, 2); | |
830 refined=refineByAlignment(orf, bases, strand, sc, r.bases, sc.startSlop(), sc.stopSlop(), 2); | |
831 if(refined || sc.type==r18S || true){break;} | |
832 } | |
833 if(refined){ | |
834 if(verbose){System.err.println("Aligned to: "+orf.start+", "+orf.stop);} | |
835 }else{ | |
836 if(verbose){System.err.println("Alignment failed.");} | |
837 orf.orfScore=Tools.min(-999, orf.orfScore-9999); | |
838 } | |
839 return refined; | |
840 } | |
841 | |
842 boolean refineByAlignment(Orf orf, byte[] bases, int strand, StatsContainer sc, byte[] consensus, | |
843 final int startSlop, final int stopSlop, int recurLimit){ | |
844 final int start0=orf.start; | |
845 final int stop0=orf.stop; | |
846 | |
847 assert(start0>=0 && start0<bases.length) : start0+", "+stop0; | |
848 assert(stop0>=0 && stop0<bases.length) : start0+", "+stop0; | |
849 | |
850 final float minID=sc.minIdentity(); | |
851 final int padding=Tools.min(alignmentPadding, 30+sc.lengthAvg/4); | |
852 final int a=Tools.max(0, orf.start-padding); | |
853 final int b=Tools.min(bases.length-1, orf.stop+padding); | |
854 final int reflen=b-a+1; | |
855 if(reflen>10*sc.lengthAvg && reflen>20000){ | |
856 System.err.println("Skipped reflen "+reflen+"/"+sc.lengthAvg+" for " | |
857 + "seqlen="+bases.length+", orf="+orf.toString()); | |
858 assert(false); | |
859 //TODO: Possibly change return to -1, 0, 1 ("can't align") | |
860 //Should be a limit on window size... | |
861 //Also consider shrinking matrix after jumbo alignments | |
862 return false; | |
863 } | |
864 assert(a>=0 && b<bases.length) : a+", "+b; | |
865 SingleStateAlignerFlat2 ssa=getSSA(); | |
866 final int minScore=ssa.minScoreByIdentity(consensus.length, minID); | |
867 int[] max=ssa.fillUnlimited(consensus, bases, a, b, minScore); | |
868 if(max==null){return false;} | |
869 | |
870 final int rows=max[0]; | |
871 final int maxCol=max[1]; | |
872 final int maxState=max[2]; | |
873 // final int maxScore=max[3]; | |
874 // final int maxStart=max[4]; | |
875 | |
876 //returns {score, bestRefStart, bestRefStop} | |
877 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight}; | |
878 final int[] score=ssa.score(consensus, bases, a, b, rows, maxCol, maxState); | |
879 final int rstart=Tools.max(score[1], 0); | |
880 final int rstop=Tools.min(score[2], bases.length-1); | |
881 final float id=ssa.tracebackIdentity(consensus, bases, a, b, rows, maxCol, maxState, null); | |
882 | |
883 assert(rstart>=0 && rstart<bases.length) : "bases="+bases.length+ | |
884 ", rstart="+rstart+", rstop="+rstop+", a="+a+", b="+b+"\n"+"score="+Arrays.toString(score)+", id="+id; | |
885 assert(rstop>=0 && rstop<bases.length) : rstart+", "+rstop; | |
886 | |
887 if(score.length>3 && recurLimit>0){ | |
888 final int padLeft=score[3]; | |
889 final int padRight=score[4]; | |
890 //TODO: This takes extra memory. It may be better to cap the width or ignore start0/stop0 here. | |
891 int rstart2=Tools.max(0, Tools.min(start0, rstart)-10); | |
892 int rstop2=Tools.min(bases.length-1, Tools.max(stop0, rstop)+10); | |
893 assert(rstart2>=0 && rstart2<bases.length) : rstart2+", "+rstop2; | |
894 assert(rstop2>=0 && rstop2<bases.length) : rstart2+", "+rstop2; | |
895 orf.start=rstart2; | |
896 orf.stop=rstop2; | |
897 if(orf.start<a || orf.stop>b){ | |
898 boolean ret=refineByAlignment(orf, bases, strand, sc, consensus, 0, 0, recurLimit-1); | |
899 if(ret){return true;} | |
900 } | |
901 orf.start=start0; | |
902 orf.stop=stop0; | |
903 // assert(false) : "Don't traceback after recur."; | |
904 } | |
905 if(verbose){ | |
906 System.err.println("Identity: "+id); | |
907 } | |
908 // assert(score.length==3) : "TODO: Handle padding requests."; | |
909 | |
910 // System.err.println("Identity: "+String.format("%.2f", 100*id)+"; location: "+rstart+"-"+rstop); | |
911 if(id<minID){return false;} | |
912 | |
913 | |
914 if(Tools.absdif(rstart, start0)>startSlop){orf.start=rstart;} | |
915 if(Tools.absdif(rstop, stop0)>stopSlop){orf.stop=rstop;} | |
916 return true; | |
917 } | |
918 | |
919 void fillPoints(final int left, final int right, final byte[] bases, final FrameStats fs, float thresh, final IntList points, final FloatList scores){ | |
920 points.clear(); | |
921 scores.clear(); | |
922 final float minThresh=thresh;//thresh*0.05f; | |
923 // System.err.println(left+", "+right+", "+thresh+", "+minThresh); | |
924 while(points.size()<8 && thresh>=minThresh){ | |
925 // System.err.println("Running with thresh="+thresh); | |
926 points.clear(); | |
927 scores.clear(); | |
928 for(int i=left; i<right; i++){ | |
929 float score=fs.scorePoint(i, bases); | |
930 // System.err.println(i+", "+score); | |
931 if(score>=thresh){ | |
932 points.add(i); | |
933 scores.add(score); | |
934 } | |
935 } | |
936 thresh=thresh*0.75f; | |
937 } | |
938 } | |
939 | |
940 /** | |
941 * Generate all possible genes from each Orf, and return them in a new set of lists. | |
942 * @param frameLists | |
943 * @param bases | |
944 * @return Lists of orfs. | |
945 */ | |
946 private ArrayList<Orf>[] breakOrfs(ArrayList<Orf>[] frameLists, byte[] bases){ | |
947 | |
948 @SuppressWarnings("unchecked") | |
949 ArrayList<Orf>[] brokenLists=new ArrayList[6]; | |
950 for(int strand=0; strand<2; strand++){ | |
951 for(int frame=0; frame<3; frame++){ | |
952 int fnum=frame+3*strand; | |
953 ArrayList<Orf> longest=frameLists[fnum]; //Longest Orf per stop | |
954 ArrayList<Orf> broken=new ArrayList<Orf>(); //All high scoring Orfs, including multiple per stop, with different starts | |
955 if(longest!=null) { | |
956 for(Orf orf : longest){ | |
957 assert(orf.frame==frame); | |
958 assert(orf.strand==strand); | |
959 ArrayList<Orf> temp=breakOrf(orf, bases); | |
960 if(temp!=null){ | |
961 broken.addAll(temp); | |
962 } | |
963 } | |
964 } | |
965 Collections.sort(broken); | |
966 brokenLists[fnum]=broken; | |
967 } | |
968 //Reverse-complement bases after processing each strand | |
969 AminoAcid.reverseComplementBasesInPlace(bases); | |
970 } | |
971 return brokenLists; | |
972 } | |
973 | |
974 /** | |
975 * Generate an Orf for each possible start codon. | |
976 * Retain only the high-scoring ones. | |
977 * @param longest Longest open reading frame for a given stop. | |
978 * @param bases Bases, oriented for this Orf. | |
979 * @return List of Orfs. | |
980 */ | |
981 private ArrayList<Orf> breakOrf(Orf longest, byte[] bases){ | |
982 assert(longest.start<longest.stop); | |
983 final int flipped=longest.flipped(); | |
984 if(flipped==1){longest.flip();}//Now the orf is aligned to its native strand | |
985 | |
986 geneStopsMade++; | |
987 | |
988 final FrameStats innerStats=pgm.statsCDS.inner; | |
989 final FrameStats startStats=pgm.statsCDS.start; | |
990 final FrameStats stopStats=pgm.statsCDS.stop; | |
991 | |
992 final String name=longest.scafName; | |
993 final int start=longest.start; | |
994 final int stop=longest.stop; | |
995 final int strand=longest.strand; | |
996 final int max=Tools.min(longest.stop-2, longest.stop-minLen+4); | |
997 | |
998 assert(pgm!=null) : pgm; | |
999 assert(pgm.statsCDS!=null) : pgm; | |
1000 assert(pgm.statsCDS.inner!=null) : pgm.statsCDS; | |
1001 assert(pgm.statsCDS.inner.k>0) : pgm.statsCDS.inner; | |
1002 | |
1003 final int k=innerStats.k; | |
1004 final int mask=~((-1)<<(2*k)); | |
1005 | |
1006 final float stopScore=stopStats.scorePoint(longest.stop, bases); | |
1007 stCds.geneStopScoreSum+=stopScore; | |
1008 stCds.geneStopScoreCount++; | |
1009 | |
1010 ArrayList<Orf> broken=new ArrayList<Orf>(); | |
1011 int created=0; | |
1012 | |
1013 int codon=0; | |
1014 int kmer=0; | |
1015 int len=0; | |
1016 int numKmers=0; | |
1017 float currentScore=0; | |
1018 for(int pos=start, currentFrame=0; pos<=stop; pos++){ | |
1019 final byte b=bases[pos]; | |
1020 final int x=AminoAcid.baseToNumber[b]; | |
1021 codon=((codon<<2)|x); | |
1022 kmer=((kmer<<2)|x)&mask; | |
1023 | |
1024 if(x>=0){ | |
1025 len++; | |
1026 if(len>=k){ | |
1027 float prob=innerStats.probs[currentFrame][kmer]; | |
1028 float dif=prob-0.99f;//Prob above 1 is more likely than average | |
1029 currentScore+=dif; | |
1030 // outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", dif)+",\tscore="+String.format(Locale.ROOT, "%.4f", currentScore)+ | |
1031 // "\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+ | |
1032 // "\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame)); | |
1033 }else{ | |
1034 // outstream.println("pos="+pos+"\tdif="+String.format(Locale.ROOT, "%.4f", 0.0)+",\tscore="+String.format(Locale.ROOT, "%.4f", 0.0)+ | |
1035 // "\tasStart="+String.format(Locale.ROOT, "%.4f", pgm.calcStartScore(pos-2, bases))+"\tasStop="+String.format(Locale.ROOT, "%.4f", stopStats.scorePoint(pos, bases))+ | |
1036 // "\tcodon="+AminoAcid.kmerToString(kmer, 3)+" frame="+(currentFrame)); | |
1037 } | |
1038 }else{ | |
1039 len=0; | |
1040 kmer=0; | |
1041 } | |
1042 | |
1043 currentFrame++; | |
1044 // outstream.println("pos="+pos+", codon="+AminoAcid.kmerToString(kmer, 3)+", frame="+currentFrame+", start="+start+", isStartCodon="+pgm.isStartCodon(codon)); | |
1045 if(currentFrame>2){ | |
1046 currentFrame=0; | |
1047 if(pos<max && created<breakLimit && (pos==start+2 || pgm.isStartCodon(codon))){ | |
1048 // outstream.println(x); | |
1049 int glen=stop-pos+3; | |
1050 assert(glen>=minLen) : "glen="+glen+", minLen="+minLen+", pos="+pos+", max="+max+", start="+start; | |
1051 | |
1052 int oStart=pos-2; | |
1053 float startScore=startStats.scorePoint(oStart, bases); | |
1054 | |
1055 stCds.geneStartScoreSum+=startScore; | |
1056 stCds.geneStartScoreCount++; | |
1057 | |
1058 stCds.lengthSum+=(stop-(pos-2)+1); | |
1059 stCds.lengthCount++; | |
1060 | |
1061 if((startScore>=minStartScore || pos<6) /* && stopScore>=minStopScore /*|| broken.isEmpty()*/){ | |
1062 Orf orf=new Orf(name, pos-2, stop, strand, longest.frame, bases, false, longest.type); | |
1063 | |
1064 geneStartsMade++; | |
1065 orf.kmerScore=currentScore; | |
1066 orf.startScore=startScore; | |
1067 orf.stopScore=stopScore; | |
1068 | |
1069 assert(orf.frame==longest.frame); | |
1070 assert(orf.strand==strand); | |
1071 | |
1072 if(strand==1){orf.flip();} | |
1073 broken.add(orf); | |
1074 created++; | |
1075 } | |
1076 } | |
1077 codon=0; | |
1078 } | |
1079 } | |
1080 | |
1081 final int size=broken.size(); | |
1082 final int sizeCutoff=Tools.max(5, size/2); | |
1083 if(size<1){return broken;} | |
1084 Orf best=broken.get(0); | |
1085 Orf bestStart=broken.get(0); | |
1086 for(int i=0; i<size; i++){ | |
1087 Orf orf=broken.get(i); | |
1088 | |
1089 //This fixes scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2). | |
1090 orf.kmerScore=currentScore-orf.kmerScore; | |
1091 orf.orfScore=orf.calcOrfScore(); | |
1092 if(orf.orfScore>=best.orfScore){best=orf;} | |
1093 if(orf.startScore>=bestStart.startScore){bestStart=orf;} | |
1094 | |
1095 stCds.geneInnerScoreSum+=orf.averageKmerScore(); | |
1096 stCds.geneInnerScoreCount++; | |
1097 } | |
1098 | |
1099 //Sort to by score descending to eliminate low-scoring copies. | |
1100 if(broken.size()>1){Collections.sort(broken, Feature.featureComparatorScore);} | |
1101 | |
1102 int removed=0; | |
1103 for(int i=0; i<size; i++){//Fix scores because they were generated together, from start to stop, to make this O(N) instead of O(N^2). | |
1104 Orf orf=broken.get(i); | |
1105 if(orf.averageKmerScore()<minInnerScore || orf.orfScore<minOrfScore || | |
1106 orf.orfScore/orf.length()<minAvgScore || | |
1107 orf.orfScore<0.5f*best.orfScore-10 || (orf.startScore<bestStart.startScore-0.55f && orf.kmerScore<best.kmerScore*1.1f && orf!=best)){ | |
1108 broken.set(i, null); | |
1109 removed++; | |
1110 }else if(i>sizeCutoff){ | |
1111 broken.set(i, null); | |
1112 removed++; | |
1113 } | |
1114 } | |
1115 if(removed>0){ | |
1116 Tools.condenseStrict(broken); | |
1117 } | |
1118 | |
1119 geneStartsRetained+=broken.size(); | |
1120 geneStopsRetained+=(broken.size()>0 ? 1 : 0); | |
1121 | |
1122 if(flipped==1){longest.flip();} | |
1123 return broken; | |
1124 } | |
1125 | |
1126 /*--------------------------------------------------------------*/ | |
1127 /*---------------- Fields ----------------*/ | |
1128 /*--------------------------------------------------------------*/ | |
1129 | |
1130 /** | |
1131 * Current gene model. | |
1132 * TODO: Dynamically swap this as needed for contigs with varying GC. | |
1133 */ | |
1134 GeneModel pgm; | |
1135 | |
1136 //Gene-calling cutoffs | |
1137 final int minLen; | |
1138 final int maxOverlapSameStrand; | |
1139 final int maxOverlapOppositeStrand; | |
1140 final float minStartScore; | |
1141 final float minStopScore; | |
1142 final float minInnerScore; | |
1143 final float minOrfScore; | |
1144 final float minAvgScore; | |
1145 | |
1146 // static float[] cutoff1=new float[] {0, 40f, 200f, 150f, 45f}; | |
1147 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 60f}; | |
1148 // static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.4f, 1.8f}; | |
1149 // static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 0.6f, 1.1f}; | |
1150 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.55f};//inner score | |
1151 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f}; | |
1152 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f}; | |
1153 | |
1154 // static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 45f}; | |
1155 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f}; | |
1156 // static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.3f, 1.8f}; | |
1157 // static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.5f, 1.1f}; | |
1158 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score | |
1159 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f}; | |
1160 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.15f}; | |
1161 | |
1162 //for k=4,2,2 | |
1163 // static float[] cutoff1=new float[] {0, 40f, 140f, 150f, 40f}; | |
1164 // static float[] cutoff2=new float[] {0, 44f, 300f, 150f, 45f}; | |
1165 // static float[] cutoff3=new float[] {0, 1.7f, 1.1f, 1.1f, 1.8f}; | |
1166 // static float[] cutoff4=new float[] {0, 1.6f, 1.3f, 0.45f, 1.1f}; | |
1167 // static float[] cutoff5=new float[] {0, 1.0f, 1.0f, 1.0f, 1.3f};//inner score | |
1168 // static float[] scoreMult=new float[] {1f, 8f, 200f, 175f, 12f}; | |
1169 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f}; | |
1170 | |
1171 // //for k=5,3,3 | |
1172 // static float[] cutoff1=new float[] {0, 40f, 300f, 300f, 135f};//sum score | |
1173 // static float[] cutoff2=new float[] {0, 44f, 300f, 400f, 100f};//orf score | |
1174 // static float[] cutoff3=new float[] {0, 1.7f, 1.5f, 1.5f, 3.5f};//start score | |
1175 // static float[] cutoff4=new float[] {0, 1.6f, 1.5f, 1.4f, 1.8f};//stop score | |
1176 // static float[] cutoff5=new float[] {0, 1.0f, 1.1f, 1.15f, 1.4f};//inner score | |
1177 // static float[] scoreMult=new float[] {1f, 8f, 80f, 120f, 5f};//score mult | |
1178 // static float[] biases=new float[] {1f, 1.17f, 1.2f, 1.2f, 1.22f};//bias for sum score | |
1179 | |
1180 //for k=6,3,3 - this has low scores for correct 23s stops; may try k=2 or k=4 for that end | |
1181 //Also, 5S seems to score low very for archaea | |
1182 // public static int CDS=0, tRNA=1, /*r16S=2,*/ r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; | |
1183 // static float[] cutoff1=new float[] {0, 100f, 600f, 400f, 300f};//sum score | |
1184 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f};//orf score | |
1185 // static float[] cutoff3=new float[] {0, 3.0f, 1.8f, 1.6f, 4.0f};//start score | |
1186 // static float[] cutoff4=new float[] {0, 2.8f, 2.0f, 0.90f, 1.9f};//stop score | |
1187 // static float[] cutoff5=new float[] {0, 2.8f, 1.1f, 1.55f, 1.4f};//inner score | |
1188 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f};//score mult | |
1189 // static float[] biases=new float[] {1f, 1.25f, 1.30f, 1.30f, 1.22f};//16S bias for sum score: 1.25 best for archs, 1.4 best for bacts | |
1190 | |
1191 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; | |
1192 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 270f, 300f};//sum score | |
1193 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score | |
1194 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 3.8f, 1.65f};//start score | |
1195 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.8f, 1.70f};//stop score | |
1196 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 1.4f, 1.70f};//inner score | |
1197 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.0f, 35f};//score mult | |
1198 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.30f, 1.30f}; | |
1199 | |
1200 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; | |
1201 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 90f, 300f};//sum score | |
1202 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 24f, 300f};//orf score | |
1203 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 2.0f, 1.65f};//start score | |
1204 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score | |
1205 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score | |
1206 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult | |
1207 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.50f, 1.30f}; | |
1208 | |
1209 //// {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; | |
1210 // static float[] cutoff1=new float[] {0, 90f, 300f, 400f, 100f, 300f};//sum score | |
1211 // static float[] cutoff2=new float[] {0, 48f, 300f, 300f, 32f, 300f};//orf score | |
1212 // static float[] cutoff3=new float[] {0, 2.8f, 1.65f, 1.6f, 1.8f, 1.65f};//start score | |
1213 // static float[] cutoff4=new float[] {0, 2.6f, 1.70f, 0.90f, 1.0f, 1.70f};//stop score | |
1214 // static float[] cutoff5=new float[] {0, 2.8f, 1.70f, 1.55f, 2.6f, 1.70f};//inner score | |
1215 // static float[] scoreMult=new float[] {1f, 1f, 35f, 80f, 1.25f, 35f};//score mult | |
1216 // static float[] biases=new float[] {1f, 1.50f, 1.30f, 1.30f, 1.55f, 1.30f}; | |
1217 | |
1218 // {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; | |
1219 static float[] cutoff1=new float[] {0, 20f, 300f, 400f, 100f, 400f};//sum score | |
1220 static float[] cutoff2=new float[] {0, 36f, 300f, 300f, 32f, 300f};//orf score //tRNA is very sensitive here | |
1221 static float[] cutoff3=new float[] {0, 2.4f, 1.65f, 1.6f, 1.8f, 1.5f};//start score | |
1222 static float[] cutoff4=new float[] {0, 1.5f, 1.70f, 0.90f, 1.0f, 1.1f};//stop score | |
1223 static float[] cutoff5=new float[] {0, 2.2f, 1.70f, 1.55f, 2.6f, 1.5f};//inner score | |
1224 static float[] scoreMult=new float[] {1f, 1.0f, 35f, 80f, 1.25f, 35f};//score mult | |
1225 static float[] biases=new float[] {1f, 1.45f, 1.30f, 1.30f, 1.55f, 1.50f}; | |
1226 | |
1227 long geneStopsMade=0; | |
1228 long geneStartsMade=0; | |
1229 long geneStartsRetained=0; | |
1230 long geneStopsRetained=0; | |
1231 long geneStartsOut=0; | |
1232 | |
1233 long tRNAOut=0; | |
1234 long r16SOut=0; | |
1235 long r23SOut=0; | |
1236 long r5SOut=0; | |
1237 long r18SOut=0; | |
1238 | |
1239 ScoreTracker stCds=new ScoreTracker(CDS); | |
1240 ScoreTracker stCds2=new ScoreTracker(CDS); | |
1241 ScoreTracker stCdsPass=new ScoreTracker(CDS); | |
1242 ScoreTracker sttRNA=new ScoreTracker(tRNA); | |
1243 ScoreTracker st16s=new ScoreTracker(r16S); | |
1244 ScoreTracker st23s=new ScoreTracker(r23S); | |
1245 ScoreTracker st5s=new ScoreTracker(r5S); | |
1246 ScoreTracker st18s=new ScoreTracker(r18S); | |
1247 | |
1248 ScoreTracker[] trackers=new ScoreTracker[] {stCdsPass, sttRNA, st16s, st23s, st5s, st18s}; | |
1249 | |
1250 public static SingleStateAlignerFlat2 getSSA(){ | |
1251 SingleStateAlignerFlat2 ssa=localSSA.get(); | |
1252 if(ssa==null){ | |
1253 synchronized(localSSA){ | |
1254 ssa=localSSA.get(); | |
1255 if(ssa==null){ | |
1256 ssa=new SingleStateAlignerFlat2(); | |
1257 localSSA.set(ssa); | |
1258 } | |
1259 } | |
1260 } | |
1261 return ssa; | |
1262 } | |
1263 | |
1264 public static SingleStateAlignerFlat3 getSSA3(){ | |
1265 SingleStateAlignerFlat3 ssa=localSSA3.get(); | |
1266 if(ssa==null){ | |
1267 synchronized(localSSA3){ | |
1268 ssa=localSSA3.get(); | |
1269 if(ssa==null){ | |
1270 ssa=new SingleStateAlignerFlat3(); | |
1271 localSSA3.set(ssa); | |
1272 } | |
1273 } | |
1274 } | |
1275 return ssa; | |
1276 } | |
1277 | |
1278 public static SingleStateAlignerFlatFloat getSSAF(){ | |
1279 SingleStateAlignerFlatFloat ssa=localSSAF.get(); | |
1280 if(ssa==null){ | |
1281 synchronized(localSSAF){ | |
1282 ssa=localSSAF.get(); | |
1283 if(ssa==null){ | |
1284 ssa=new SingleStateAlignerFlatFloat(); | |
1285 localSSAF.set(ssa); | |
1286 } | |
1287 } | |
1288 } | |
1289 return ssa; | |
1290 } | |
1291 | |
1292 private static ThreadLocal<SingleStateAlignerFlat2> localSSA=new ThreadLocal<SingleStateAlignerFlat2>(); | |
1293 private static ThreadLocal<SingleStateAlignerFlat3> localSSA3=new ThreadLocal<SingleStateAlignerFlat3>(); | |
1294 private static ThreadLocal<SingleStateAlignerFlatFloat> localSSAF=new ThreadLocal<SingleStateAlignerFlatFloat>(); | |
1295 // public static int maxAlignmentEndpointDifference=15; | |
1296 public static int alignmentPadding=300; | |
1297 | |
1298 public static int breakLimit=12; | |
1299 public static int lookbackPlus=70; | |
1300 public static int lookbackMinus=25; | |
1301 | |
1302 // pathScore+=p0+p1*(Tools.mid(p5*(p2+pathLength), p6*(p3-pathLength), p4)); | |
1303 | |
1304 //Operon length modifiers for same strand | |
1305 public static float p0=-30f; | |
1306 public static float p1=-0.35f; //Sensitive - higher decreases FP and TP | |
1307 public static float p2=4.0f;//Insensitive - higher positive decreases FP and TP | |
1308 public static float p3=12f; //Higher decreases FP (substantially) and TP (slightly) | |
1309 public static float p4=-10f; //Lower decreases FP (weakly) and TP (greatly) | |
1310 public static float p5=2.0f; //Insensitive - lower increases FP and TP | |
1311 public static float p6=2f; //Greater increases FP and TP | |
1312 | |
1313 // pathScore+=q1+Tools.mid(q2*prevLength, q3+q4*prevLength, q5); | |
1314 | |
1315 //Operon length modifiers for opposite strand | |
1316 public static float q1=-36f; | |
1317 public static float q2=-1.6f; //q2 and q4 must have opposite signs | |
1318 public static float q3=-12f; | |
1319 public static float q4=3.0f; //(Lower [even negative] decreases FP and TP) | |
1320 public static float q5=-40f; | |
1321 | |
1322 private static PrintStream outstream=System.err; | |
1323 static boolean verbose; | |
1324 | |
1325 } |