Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package prok; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.BitSet; | |
6 import java.util.HashMap; | |
7 | |
8 import aligner.SingleStateAlignerFlat2; | |
9 import dna.AminoAcid; | |
10 import fileIO.FileFormat; | |
11 import gff.GffLine; | |
12 import shared.KillSwitch; | |
13 import shared.Shared; | |
14 import shared.Tools; | |
15 import stream.Read; | |
16 import stream.ReadInputStream; | |
17 import structures.ByteBuilder; | |
18 import structures.IntList; | |
19 | |
20 /** | |
21 * This class is designed to store kmer frequencies related to gene | |
22 * starts, stops, and interiors. It can be loaded from a pgm file. | |
23 * | |
24 * It's possible to use multiple GeneModels; for example, one for | |
25 * each of several GC ranges or clades. | |
26 * @author Brian Bushnell | |
27 * @date Sep 24, 2018 | |
28 * | |
29 */ | |
30 public class GeneModel extends ProkObject { | |
31 | |
32 /*--------------------------------------------------------------*/ | |
33 /*---------------- Initialization ----------------*/ | |
34 /*--------------------------------------------------------------*/ | |
35 | |
36 public GeneModel(boolean fill){ | |
37 if(fill){ | |
38 fillContainers(); | |
39 } | |
40 } | |
41 | |
42 void fillContainers(){ | |
43 statsCDS.setInner(kInnerCDS, 3); | |
44 statsCDS.setStart(kStartCDS, startFrames, startLeftOffset); | |
45 statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset); | |
46 | |
47 for(int i=0; i<rnaContainers.length; i++){ | |
48 StatsContainer sc=rnaContainers[i]; | |
49 sc.setInner(kInnerRNA, 1); | |
50 } | |
51 | |
52 statstRNA.setStart(kStartRNA, 14, 4); | |
53 statstRNA.setStop(kStopRNA, 14, 6); | |
54 | |
55 stats16S.setStart(kStartRNA, 20, 7); | |
56 stats16S.setStop(kStopRNA, 12, 16); | |
57 | |
58 stats23S.setStart(kStartRNA, 17, 3); | |
59 stats23S.setStop(kStopRNA, 15, 12); | |
60 | |
61 stats5S.setStart(kStartRNA, 20, 5); | |
62 stats5S.setStop(kStopRNA, 15, 5); | |
63 | |
64 stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined | |
65 stats18S.setStop(kStopRNA, 12, 16); | |
66 } | |
67 | |
68 /*--------------------------------------------------------------*/ | |
69 /*---------------- Public Methods ----------------*/ | |
70 /*--------------------------------------------------------------*/ | |
71 | |
72 public boolean process(String genomeFname, String gffFname){ | |
73 // fnames.add(ReadWrite.stripPath(genomeFname)); | |
74 numFiles++; | |
75 FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true); | |
76 FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true); | |
77 | |
78 if(fnaFile==null || gffFile==null){ | |
79 errorState=true; | |
80 return true; | |
81 } | |
82 filesProcessed++; | |
83 | |
84 ArrayList<ScafData> scafList; | |
85 {//Scoped to save memory | |
86 ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads); | |
87 readsProcessed+=reads.size(); | |
88 scafList=new ArrayList<ScafData>(reads.size()); | |
89 for(Read r : reads){ | |
90 basesProcessed+=r.length(); | |
91 scafList.add(new ScafData(r)); | |
92 } | |
93 } | |
94 {//Scoped to save memory | |
95 ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true); | |
96 ArrayList<GffLine> cds=allGffLines[0]; | |
97 ArrayList<GffLine> rrna=allGffLines[1]; | |
98 ArrayList<GffLine> trna=allGffLines[2]; | |
99 genesProcessed+=cds.size(); | |
100 genesProcessed+=(rrna==null ? 0 : rrna.size()); | |
101 genesProcessed+=(trna==null ? 0 : trna.size()); | |
102 | |
103 HashMap<String, ScafData> scafMap=makeScafMap(scafList); | |
104 fillScafDataCDS(cds, scafMap); | |
105 fillScafDataRNA(rrna, scafMap); | |
106 fillScafDataRNA(trna, scafMap); | |
107 } | |
108 | |
109 countBases(scafList); | |
110 if(PROCESS_PLUS_STRAND){ | |
111 processStrand(scafList, Shared.PLUS); | |
112 } | |
113 if(PROCESS_MINUS_STRAND){ | |
114 for(ScafData sd : scafList){ | |
115 sd.clear(); | |
116 sd.reverseComplement(); | |
117 } | |
118 processStrand(scafList, Shared.MINUS); | |
119 for(ScafData sd : scafList){ | |
120 sd.clear(); | |
121 sd.reverseComplement(); | |
122 } | |
123 } | |
124 return false; | |
125 } | |
126 | |
127 public void add(GeneModel pgm){ | |
128 for(int i=0; i<allContainers.length; i++){ | |
129 // System.err.println("merging "+allContainers[i].name); | |
130 allContainers[i].add(pgm.allContainers[i]); | |
131 } | |
132 | |
133 readsProcessed+=pgm.readsProcessed; | |
134 basesProcessed+=pgm.basesProcessed; | |
135 genesProcessed+=pgm.genesProcessed; | |
136 filesProcessed+=pgm.filesProcessed; | |
137 | |
138 // geneStartsProcessed+=pgm.geneStartsProcessed; | |
139 // tRNAProcessed+=pgm.tRNAProcessed; | |
140 // r16SProcessed+=pgm.r16SProcessed; | |
141 // r23SProcessed+=pgm.r23SProcessed; | |
142 // r5SProcessed+=pgm.r5SProcessed; | |
143 // r18SProcessed+=pgm.r18SProcessed; | |
144 | |
145 // fnames.addAll(pgm.fnames); | |
146 numFiles+=pgm.numFiles; | |
147 taxIds.addAll(pgm.taxIds); | |
148 Tools.add(baseCounts, pgm.baseCounts); | |
149 } | |
150 | |
151 public void multiplyBy(double mult) { | |
152 for(int i=0; i<allContainers.length; i++){ | |
153 allContainers[i].multiplyBy(mult); | |
154 } | |
155 | |
156 readsProcessed=Math.round(readsProcessed*mult); | |
157 basesProcessed=Math.round(basesProcessed*mult); | |
158 genesProcessed=Math.round(genesProcessed*mult); | |
159 filesProcessed=Math.round(filesProcessed*mult); | |
160 | |
161 for(int i=0; i<baseCounts.length; i++){ | |
162 baseCounts[i]=Math.round(baseCounts[i]*mult); | |
163 } | |
164 } | |
165 | |
166 /*--------------------------------------------------------------*/ | |
167 /*---------------- Outer Methods ----------------*/ | |
168 /*--------------------------------------------------------------*/ | |
169 | |
170 public float gc(){ | |
171 long a=baseCounts[0]; | |
172 long c=baseCounts[1]; | |
173 long g=baseCounts[2]; | |
174 long t=baseCounts[3]; | |
175 return (float)((g+c)/Tools.max(1.0, a+t+g+c)); | |
176 } | |
177 | |
178 HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){ | |
179 HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3); | |
180 for(ScafData sd : scafList){scafMap.put(sd.name, sd);} | |
181 for(ScafData sd : scafList){ | |
182 String name=sd.name; | |
183 int idx=name.indexOf(' '); | |
184 if(idx>=0){ | |
185 String prefix=name.substring(0, idx); | |
186 if(scafMap.containsKey(prefix)){ | |
187 assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'"; | |
188 }else{ | |
189 scafMap.put(prefix, sd); | |
190 } | |
191 } | |
192 } | |
193 return scafMap; | |
194 } | |
195 | |
196 public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){ | |
197 if(!callCDS){return;} | |
198 for(GffLine gline : cdsLines){ | |
199 ScafData sd=scafMap.get(gline.seqid); | |
200 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid; | |
201 sd.addCDS(gline); | |
202 } | |
203 } | |
204 | |
205 public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){ | |
206 for(GffLine gline : rnaLines){ | |
207 ScafData sd=scafMap.get(gline.seqid); | |
208 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid; | |
209 if(processType(gline.prokType())){ | |
210 sd.addRNA(gline); | |
211 } | |
212 } | |
213 } | |
214 | |
215 public void processStrand(ArrayList<ScafData> scafList, int strand){ | |
216 for(ScafData sd : scafList){ | |
217 processCDS(sd, strand); | |
218 processRNA(sd, strand); | |
219 } | |
220 } | |
221 | |
222 /*--------------------------------------------------------------*/ | |
223 /*---------------- Inner Methods ----------------*/ | |
224 /*--------------------------------------------------------------*/ | |
225 | |
226 private void countBases(ArrayList<ScafData> scafList){ | |
227 for(ScafData sd : scafList){ | |
228 countBases(sd.bases); | |
229 } | |
230 } | |
231 | |
232 private void countBases(byte[] bases){ | |
233 for(byte b : bases){ | |
234 int x=AminoAcid.baseToNumberACGTother[b]; | |
235 baseCounts[x]++; | |
236 } | |
237 } | |
238 | |
239 /*--------------------------------------------------------------*/ | |
240 /*---------------- Finding Codons ----------------*/ | |
241 /*--------------------------------------------------------------*/ | |
242 | |
243 private static void findStopCodons(byte[] bases, IntList list, BitSet valid){ | |
244 final int k=3; | |
245 final int mask=~((-1)<<(2*k)); | |
246 int kmer=0; | |
247 int len=0; | |
248 | |
249 for(int i=0; i<bases.length; i++){ | |
250 byte b=bases[i]; | |
251 int x=AminoAcid.baseToNumber[b]; | |
252 kmer=((kmer<<2)|x)&mask; | |
253 if(x>=0){ | |
254 len++; | |
255 if(len>=k){ | |
256 int point=i;//End of the stop codon | |
257 if(isStopCodon(kmer) && !valid.get(point)){ | |
258 list.add(point); | |
259 valid.set(point); | |
260 } | |
261 } | |
262 }else{len=0;} | |
263 } | |
264 | |
265 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise | |
266 if(!valid.get(i)){ | |
267 list.add(i); | |
268 } | |
269 } | |
270 } | |
271 | |
272 private static void findStartCodons(byte[] bases, IntList list, BitSet valid){ | |
273 final int k=3; | |
274 final int mask=~((-1)<<(2*k)); | |
275 int kmer=0; | |
276 int len=0; | |
277 | |
278 for(int i=0; i<bases.length; i++){ | |
279 byte b=bases[i]; | |
280 int x=AminoAcid.baseToNumber[b]; | |
281 kmer=((kmer<<2)|x)&mask; | |
282 if(x>=0){ | |
283 len++; | |
284 if(len>=k){ | |
285 int point=i-k+1;//Start of the start codon | |
286 if(isStartCodon(kmer) && !valid.get(point)){ | |
287 list.add(point); | |
288 valid.set(point); | |
289 } | |
290 } | |
291 }else{len=0;} | |
292 } | |
293 | |
294 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise | |
295 if(!valid.get(i)){ | |
296 list.add(i); | |
297 } | |
298 } | |
299 } | |
300 | |
301 /*--------------------------------------------------------------*/ | |
302 /*---------------- Processing GffLines ----------------*/ | |
303 /*--------------------------------------------------------------*/ | |
304 | |
305 private static void processGene(GffLine gline, ScafData sd){ | |
306 if(gline.length()<2){return;} | |
307 final int strand=gline.strand; | |
308 assert(strand==sd.strand()); | |
309 final byte[] frames=sd.frames; | |
310 int start=gline.start-1, stop=gline.stop-1; | |
311 if(start<0 || stop>=sd.length()){return;} | |
312 // assert(start<stop) : gline; //Not always true for euks... | |
313 if(strand==Shared.MINUS){ | |
314 int x=sd.length()-start-1; | |
315 int y=sd.length()-stop-1; | |
316 start=y; | |
317 stop=x; | |
318 | |
319 // String a=new String(sd.bases, start, 3); | |
320 // String b=new String(sd.bases, stop-2, 3); | |
321 //// assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3); | |
322 // outstream.println(a+", "+b+", "+start+", "+stop); | |
323 } | |
324 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name; | |
325 markFrames(start, stop, frames, kInnerCDS); | |
326 sd.starts.add(start); | |
327 sd.stops.add(stop); | |
328 // assert(gline.start!=337) : gline+"\n"+start+", "+stop; | |
329 } | |
330 | |
331 private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){ | |
332 final int strand=gline.strand; | |
333 assert(strand==sd.strand()); | |
334 final byte[] frames=sd.frames; | |
335 int start=gline.start-1, stop=gline.stop-1; | |
336 if(start<0 || stop>=sd.length()){return false;} | |
337 assert(start<=stop); | |
338 if(strand==Shared.MINUS){ | |
339 int x=sd.length()-start-1; | |
340 int y=sd.length()-stop-1; | |
341 start=y; | |
342 stop=x; | |
343 } | |
344 | |
345 if(AnalyzeGenes.alignRibo){ | |
346 // byte[] seq=sd.fetch(start, stop); | |
347 Read[] consensusReads=ProkObject.consensusReads(type); | |
348 byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null); | |
349 float minIdentity=ProkObject.minID(type); | |
350 if(universal!=null){ | |
351 int[] coords=KillSwitch.allocInt1D(2); | |
352 final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50)); | |
353 final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50)); | |
354 float id1=align(universal, sd.bases, a, b, minIdentity, coords); | |
355 final int rstart=coords[0], rstop=coords[1]; | |
356 // assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop; | |
357 if(id1<minIdentity){ | |
358 // System.err.println("Low identity: "+String.format("%.2s", 100*id1)); | |
359 return false; | |
360 }else{ | |
361 // System.err.println("Good identity: "+String.format("%.2s", 100*id1)); | |
362 } | |
363 if(AnalyzeGenes.adjustEndpoints){ | |
364 int startSlop=startSlop(type); | |
365 int stopSlop=stopSlop(type); | |
366 if(Tools.absdif(start, rstart)>startSlop){ | |
367 // System.err.println("rstart:\t"+start+" -> "+rstart); | |
368 start=rstart; | |
369 } | |
370 if(Tools.absdif(stop, rstop)>stopSlop){ | |
371 // System.err.println("rstop: \t"+stop+" -> "+rstop); | |
372 stop=rstop; | |
373 } | |
374 } | |
375 } | |
376 } | |
377 | |
378 StatsContainer sc=allContainers[type]; | |
379 sc.start.processPoint(sd.bases, start, 1); | |
380 sc.stop.processPoint(sd.bases, stop, 1); | |
381 assert(sc!=statsCDS); | |
382 | |
383 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name; | |
384 final byte flag=typeToFlag(type); | |
385 for(int i=start+kInnerRNA-1; i<=stop; i++){ | |
386 frames[i]|=flag; | |
387 } | |
388 return true; | |
389 } | |
390 | |
391 private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){ | |
392 // final int a=0, b=ref.length-1; | |
393 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); | |
394 final int minScore=ssa.minScoreByIdentity(query.length, minIdentity); | |
395 int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore); | |
396 if(max==null){return 0;} | |
397 | |
398 final int rows=max[0]; | |
399 final int maxCol=max[1]; | |
400 final int maxState=max[2]; | |
401 // final int maxScore=max[3]; | |
402 // final int maxStart=max[4]; | |
403 | |
404 //returns {score, bestRefStart, bestRefStop} | |
405 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight}; | |
406 final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState); | |
407 final int rstart=Tools.max(score[1], 0); | |
408 final int rstop=Tools.min(score[2], ref.length-1); | |
409 if(coords!=null){ | |
410 coords[0]=rstart; | |
411 coords[1]=rstop; | |
412 } | |
413 final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null); | |
414 return id; | |
415 } | |
416 | |
417 /** | |
418 * Each frame byte has a bit marked for valid coding frames. | |
419 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon. | |
420 * If frames[23]=0, then no coding kmer end at that location on this strand. | |
421 * @param start | |
422 * @param stop | |
423 * @param frames | |
424 * @param k | |
425 */ | |
426 private static void markFrames(int start, int stop, byte[] frames, int k){ | |
427 assert(start<=stop) : start+", "+stop; | |
428 for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){ | |
429 frames[i]=(byte)(frames[i]|frameBit); | |
430 frameBit<<=1; | |
431 if(frameBit>4){frameBit=1;} | |
432 } | |
433 // assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct | |
434 } | |
435 | |
436 /*--------------------------------------------------------------*/ | |
437 /*---------------- Counting Kmers ----------------*/ | |
438 /*--------------------------------------------------------------*/ | |
439 | |
440 private void processCDS(ScafData sd, int strand){ | |
441 if(!callCDS){return;} | |
442 ArrayList<GffLine> glines=sd.cdsLines[strand]; | |
443 for(GffLine gline : glines){ | |
444 assert(gline.strand==strand); | |
445 processGene(gline, sd); | |
446 statsCDS.addLength(gline.length()); | |
447 } | |
448 | |
449 statsCDS.inner.processCDSFrames(sd.bases, sd.frames); | |
450 BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1); | |
451 BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1); | |
452 // outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops."); | |
453 sd.clear(); | |
454 findStartCodons(sd.bases, sd.starts, startSet); | |
455 findStopCodons(sd.bases, sd.stops, stopSet); | |
456 // outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops."); | |
457 processEnds(sd.bases, statsCDS.start, sd.starts, 0); | |
458 processEnds(sd.bases, statsCDS.stop, sd.stops, 0); | |
459 } | |
460 | |
461 private static int getGlineType(GffLine gline, ScafData sd){ | |
462 if(!gline.inbounds(sd.length()) || gline.partial()){return -1;} | |
463 | |
464 final int length=gline.length(); | |
465 final int type=gline.prokType(); | |
466 if(type<0){ | |
467 return type; | |
468 }else if(type==CDS){ | |
469 return type; | |
470 }else if(type==tRNA && length>=40 && length<=120){ | |
471 return type; | |
472 }else if(type==r16S && length>=1440 && length<=1620){ | |
473 return type; | |
474 }else if(type==r23S && length>=2720 && length<=3170){ | |
475 return type; | |
476 }else if(type==r5S && length>=90 && length<=150){ | |
477 return type; | |
478 }else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range | |
479 return type; | |
480 } | |
481 return -1; | |
482 } | |
483 | |
484 private void processRNA(ScafData sd, int strand){ | |
485 sd.clear(); | |
486 ArrayList<GffLine> lines=sd.rnaLines[strand]; | |
487 for(GffLine gline : lines){ | |
488 assert(gline.strand==strand); | |
489 final int type=getGlineType(gline, sd); | |
490 if(type>0){ | |
491 StatsContainer sc=allContainers[type]; | |
492 sc.addLength(gline.length()); | |
493 processRnaLine(gline, sd, type); | |
494 } | |
495 } | |
496 processRnaInner(sd); | |
497 processRnaEnds(sd); | |
498 } | |
499 | |
500 void processRnaInner(ScafData sd){ | |
501 byte[] bases=sd.bases; | |
502 byte[] frames=sd.frames; | |
503 final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs. | |
504 final int mask=~((-1)<<(2*k)); | |
505 int kmer=0; | |
506 int len=0; | |
507 | |
508 for(int i=0; i<bases.length; i++){ | |
509 byte b=bases[i]; | |
510 int x=AminoAcid.baseToNumber[b]; | |
511 kmer=((kmer<<2)|x)&mask; | |
512 if(x>=0){ | |
513 len++; | |
514 if(len>=k){ | |
515 int vf=frames[i]; | |
516 for(int type=0; type<5; type++){ | |
517 int valid=vf&1; | |
518 rnaContainers[type].inner.add(kmer, 0, valid); | |
519 vf=(vf>>1); | |
520 } | |
521 } | |
522 }else{len=0;} | |
523 } | |
524 } | |
525 | |
526 void processRnaEnds(ScafData sd){ | |
527 byte[] bases=sd.bases; | |
528 | |
529 final int k=stats16S.start.k; | |
530 final int kMax=stats16S.start.kMax; | |
531 final int mask=stats16S.start.mask; | |
532 final long[] counts=new long[kMax];//TODO: Slow | |
533 | |
534 int kmer=0; | |
535 int len=0; | |
536 | |
537 for(int i=0; i<bases.length; i++){ | |
538 byte b=bases[i]; | |
539 int x=AminoAcid.baseToNumber[b]; | |
540 kmer=((kmer<<2)|x)&mask; | |
541 | |
542 if(x>=0){ | |
543 len++; | |
544 if(len>=k){ | |
545 counts[kmer]++; | |
546 } | |
547 }else{len=0;} | |
548 } | |
549 for(StatsContainer sc : rnaContainers){ | |
550 FrameStats fs=sc.start; | |
551 for(long[] array : fs.countsFalse){ | |
552 Tools.add(array, counts); | |
553 } | |
554 fs=sc.stop; | |
555 for(long[] array : fs.countsFalse){ | |
556 Tools.add(array, counts); | |
557 } | |
558 } | |
559 } | |
560 | |
561 private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){ | |
562 BitSet points=new BitSet(bases.length); | |
563 for(int i=0; i<list.size; i++){ | |
564 int point=list.get(i); | |
565 stats.processPoint(bases, list.get(i), valid); | |
566 points.set(point); | |
567 } | |
568 return points; | |
569 } | |
570 | |
571 /*--------------------------------------------------------------*/ | |
572 /*---------------- Scoring ----------------*/ | |
573 /*--------------------------------------------------------------*/ | |
574 | |
575 // //Assumes bases are in the correct strand | |
576 // public float calcStartScore(int start, int stop, byte[] bases){ | |
577 // float f=scorePoint(start, bases, startStats); | |
578 //// float ss=scoreStart2(start, bases, stop, innerKmerStats); | |
579 //// if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study. | |
580 // return f; | |
581 // } | |
582 // | |
583 // //Assumes bases are in the correct strand | |
584 // public float calcStopScore(int stop, byte[] bases){ | |
585 // float f=scorePoint(stop, bases, stopStats); | |
586 // return f; | |
587 // } | |
588 // | |
589 // //Assumes bases are in the correct strand | |
590 // public float calcRnaStartScore(int start, int stop, byte[] bases){ | |
591 // float f=scorePoint(start, bases, rrnaStartStats); | |
592 // return f; | |
593 // } | |
594 // | |
595 // //Assumes bases are in the correct strand | |
596 // public float calcRnaStopScore(int stop, byte[] bases){ | |
597 // float f=scorePoint(stop, bases, rrnaStopStats); | |
598 // return f; | |
599 // } | |
600 | |
601 // public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){ | |
602 // | |
603 // assert(stats.frames==3); | |
604 // final int k=stats.k; | |
605 // final int mask=~((-1)<<(2*k)); | |
606 // | |
607 // int kmer=0; | |
608 // int len=0; | |
609 // float score=0; | |
610 // int numKmers=0; | |
611 // | |
612 // for(int pos=start, currentFrame=startFrame; pos<stop; pos++){ | |
613 // final byte b=bases[pos]; | |
614 // final int x=AminoAcid.baseToNumber[b]; | |
615 // | |
616 // if(x>=0){ | |
617 // kmer=((kmer<<2)|x)&mask; | |
618 // len++; | |
619 // if(len>=k){ | |
620 // float prob=stats.probs[currentFrame][kmer]; | |
621 // float dif=prob-0.99f;//Prob above 1 is more likely than average | |
622 // score+=dif; | |
623 // numKmers++; | |
624 // } | |
625 // }else{ | |
626 // len=0; | |
627 // kmer=0; | |
628 // } | |
629 // | |
630 // currentFrame++; | |
631 // if(currentFrame>2){currentFrame=0;} | |
632 // } | |
633 // return score/Tools.max(1f, numKmers); | |
634 // } | |
635 // | |
636 // /** | |
637 // * TODO | |
638 // * Evaluate the relative difference between left and right frequencies. | |
639 // * The purpose is to find locations where the left side looks noncoding and the right side looks coding. | |
640 // * Does not currently yield useful results. | |
641 // */ | |
642 // public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){ | |
643 // final int k=stats.k; | |
644 // | |
645 // int start=point-45; | |
646 // if(start<0 || stop>bases.length){return 0.5f;} | |
647 // | |
648 // float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats); | |
649 // float right=calcKmerScore(point, stop-3, 0, bases, stats); | |
650 // return right-left; //High numbers are likely to be starts; non-starts should be near 0. | |
651 // } | |
652 | |
653 /*--------------------------------------------------------------*/ | |
654 /*---------------- toString ----------------*/ | |
655 /*--------------------------------------------------------------*/ | |
656 | |
657 @Override | |
658 public String toString(){ | |
659 return appendTo(new ByteBuilder()).toString(); | |
660 } | |
661 | |
662 public ByteBuilder appendTo(ByteBuilder bb){ | |
663 | |
664 // Collections.sort(fnames); | |
665 taxIds.sort(); | |
666 | |
667 bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n"); | |
668 bb.append("#files"); | |
669 bb.tab().append(numFiles); | |
670 // if(fnames.size()>5){ | |
671 // bb.tab().append(fnames.size()); | |
672 // }else{ | |
673 // for(String fname : fnames){ | |
674 // bb.tab().append(fname); | |
675 // } | |
676 // } | |
677 bb.nl(); | |
678 bb.append("#taxIDs"); | |
679 for(int i=0; i<taxIds.size; i++){ | |
680 bb.tab().append(taxIds.get(i)); | |
681 } | |
682 bb.nl(); | |
683 // bb.append("#k_inner\t").append(innerKmerLength).nl(); | |
684 // bb.append("#k_end\t").append(endKmerLength).nl(); | |
685 // bb.append("#start_left_offset\t").append(startLeftOffset).nl(); | |
686 // bb.append("#start_right_offset\t").append(startRightOffset).nl(); | |
687 // bb.append("#stop_left_offset\t").append(stopLeftOffset).nl(); | |
688 // bb.append("#stop_right_offset\t").append(stopRightOffset).nl(); | |
689 bb.append("#scaffolds\t").append(readsProcessed).nl(); | |
690 bb.append("#bases\t").append(basesProcessed).nl(); | |
691 bb.append("#genes\t").append(genesProcessed).nl(); | |
692 bb.append("#GC\t").append(gc(),2).nl(); | |
693 bb.append("#ACGTN"); | |
694 for(long x : baseCounts){ | |
695 bb.tab().append(x); | |
696 } | |
697 bb.nl(); | |
698 | |
699 for(StatsContainer sc : allContainers){ | |
700 sc.appendTo(bb); | |
701 } | |
702 assert(allContainers.length>5) : allContainers.length; | |
703 | |
704 return bb; | |
705 } | |
706 | |
707 /*--------------------------------------------------------------*/ | |
708 /*---------------- Stats ----------------*/ | |
709 /*--------------------------------------------------------------*/ | |
710 | |
711 public final StatsContainer statsCDS=new StatsContainer(CDS); | |
712 public final StatsContainer statstRNA=new StatsContainer(tRNA); | |
713 public final StatsContainer stats16S=new StatsContainer(r16S); | |
714 public final StatsContainer stats23S=new StatsContainer(r23S); | |
715 public final StatsContainer stats5S=new StatsContainer(r5S); | |
716 public final StatsContainer stats18S=new StatsContainer(r18S); | |
717 | |
718 final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S}; | |
719 final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S}; | |
720 //public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; | |
721 | |
722 // public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0); | |
723 // public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset); | |
724 // public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset); | |
725 // | |
726 // public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8); | |
727 // public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8); | |
728 // | |
729 // public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0); | |
730 // public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0); | |
731 // public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0); | |
732 // public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0); | |
733 // public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats}; | |
734 | |
735 /*--------------------------------------------------------------*/ | |
736 | |
737 // public ArrayList<String> fnames=new ArrayList<String>(); | |
738 public int numFiles=0; | |
739 public IntList taxIds=new IntList(); | |
740 | |
741 | |
742 /*--------------------------------------------------------------*/ | |
743 /*---------------- Fields ----------------*/ | |
744 /*--------------------------------------------------------------*/ | |
745 | |
746 private long maxReads=-1; | |
747 | |
748 long readsProcessed=0; | |
749 long basesProcessed=0; | |
750 long genesProcessed=0; | |
751 long filesProcessed=0; | |
752 long[] baseCounts=new long[5]; | |
753 | |
754 /*--------------------------------------------------------------*/ | |
755 /*---------------- Static Setters ----------------*/ | |
756 /*--------------------------------------------------------------*/ | |
757 | |
758 public synchronized void setStatics(){ | |
759 // assert(!setStatics); | |
760 kInnerCDS=statsCDS.inner.k; | |
761 kStartCDS=statsCDS.start.k; | |
762 kStopCDS=statsCDS.stop.k; | |
763 | |
764 setStartLeftOffset(statsCDS.start.leftOffset); | |
765 setStartRightOffset(statsCDS.start.rightOffset()); | |
766 | |
767 setStopLeftOffset(statsCDS.stop.leftOffset); | |
768 setStopRightOffset(statsCDS.stop.rightOffset()); | |
769 | |
770 kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here? | |
771 kStartRNA=stats16S.start.k; | |
772 kStopRNA=stats16S.stop.k; | |
773 setStatics=true; | |
774 } | |
775 | |
776 public static void setInnerK(int k){ | |
777 kInnerCDS=k; | |
778 } | |
779 | |
780 public static void setStartK(int k){ | |
781 kStartCDS=k; | |
782 } | |
783 | |
784 public static void setStopK(int k){ | |
785 kStopCDS=k; | |
786 } | |
787 | |
788 public static void setStartLeftOffset(int x){ | |
789 startLeftOffset=x; | |
790 startFrames=startLeftOffset+startRightOffset+1; | |
791 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames); | |
792 } | |
793 | |
794 public static void setStartRightOffset(int x){ | |
795 startRightOffset=x; | |
796 startFrames=startLeftOffset+startRightOffset+1; | |
797 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames); | |
798 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames; | |
799 } | |
800 | |
801 public static void setStopLeftOffset(int x){ | |
802 stopLeftOffset=x; | |
803 stopFrames=stopLeftOffset+stopRightOffset+1; | |
804 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames); | |
805 } | |
806 | |
807 public static void setStopRightOffset(int x){ | |
808 stopRightOffset=x; | |
809 stopFrames=stopLeftOffset+stopRightOffset+1; | |
810 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames); | |
811 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames; | |
812 } | |
813 | |
814 public static final boolean isStartCodon(int code){ | |
815 return code>=0 && code<=63 && isStartCodon[code]; | |
816 } | |
817 public static final boolean isStopCodon(int code){ | |
818 return code>=0 && code<=63 && isStopCodon[code]; | |
819 } | |
820 | |
821 /*--------------------------------------------------------------*/ | |
822 /*---------------- Class Init ----------------*/ | |
823 /*--------------------------------------------------------------*/ | |
824 | |
825 private static boolean[] makeIsCodon(String[] codons){ | |
826 boolean[] array=new boolean[64]; | |
827 for(String s : codons){ | |
828 int x=AminoAcid.toNumber(s); | |
829 array[x]=true; | |
830 } | |
831 return array; | |
832 } | |
833 | |
834 public static int kInnerCDS=6; | |
835 public static int kStartCDS=3; | |
836 public static int kStopCDS=3; | |
837 | |
838 static int startLeftOffset(){return startLeftOffset;} | |
839 static int startRightOffset(){return startRightOffset;} | |
840 static int startFrames(){return startFrames;} | |
841 | |
842 private static int startLeftOffset=21; //21 works well for k=4 | |
843 private static int startRightOffset=8; //10 works well for k=4 | |
844 private static int startFrames=startLeftOffset+startRightOffset+1; | |
845 | |
846 private static int stopLeftOffset=9; | |
847 private static int stopRightOffset=12; | |
848 private static int stopFrames=stopLeftOffset+stopRightOffset+1; | |
849 | |
850 private static boolean setStatics=false; | |
851 | |
852 /*--------------------------------------------------------------*/ | |
853 /*---------------- More Statics ----------------*/ | |
854 /*--------------------------------------------------------------*/ | |
855 | |
856 //E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9] | |
857 public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"}; | |
858 public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"}; | |
859 public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"}; | |
860 public static boolean[] isStartCodon=makeIsCodon(startCodons); | |
861 public static boolean[] isStopCodon=makeIsCodon(stopCodons); | |
862 | |
863 /*--------------------------------------------------------------*/ | |
864 | |
865 private static PrintStream outstream=System.err; | |
866 public static boolean verbose=false; | |
867 public static boolean errorState=false; | |
868 | |
869 } |