annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package prok;
jpayne@68 2
jpayne@68 3 import java.io.PrintStream;
jpayne@68 4 import java.util.ArrayList;
jpayne@68 5 import java.util.BitSet;
jpayne@68 6 import java.util.HashMap;
jpayne@68 7
jpayne@68 8 import aligner.SingleStateAlignerFlat2;
jpayne@68 9 import dna.AminoAcid;
jpayne@68 10 import fileIO.FileFormat;
jpayne@68 11 import gff.GffLine;
jpayne@68 12 import shared.KillSwitch;
jpayne@68 13 import shared.Shared;
jpayne@68 14 import shared.Tools;
jpayne@68 15 import stream.Read;
jpayne@68 16 import stream.ReadInputStream;
jpayne@68 17 import structures.ByteBuilder;
jpayne@68 18 import structures.IntList;
jpayne@68 19
jpayne@68 20 /**
jpayne@68 21 * This class is designed to store kmer frequencies related to gene
jpayne@68 22 * starts, stops, and interiors. It can be loaded from a pgm file.
jpayne@68 23 *
jpayne@68 24 * It's possible to use multiple GeneModels; for example, one for
jpayne@68 25 * each of several GC ranges or clades.
jpayne@68 26 * @author Brian Bushnell
jpayne@68 27 * @date Sep 24, 2018
jpayne@68 28 *
jpayne@68 29 */
jpayne@68 30 public class GeneModel extends ProkObject {
jpayne@68 31
jpayne@68 32 /*--------------------------------------------------------------*/
jpayne@68 33 /*---------------- Initialization ----------------*/
jpayne@68 34 /*--------------------------------------------------------------*/
jpayne@68 35
jpayne@68 36 public GeneModel(boolean fill){
jpayne@68 37 if(fill){
jpayne@68 38 fillContainers();
jpayne@68 39 }
jpayne@68 40 }
jpayne@68 41
jpayne@68 42 void fillContainers(){
jpayne@68 43 statsCDS.setInner(kInnerCDS, 3);
jpayne@68 44 statsCDS.setStart(kStartCDS, startFrames, startLeftOffset);
jpayne@68 45 statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset);
jpayne@68 46
jpayne@68 47 for(int i=0; i<rnaContainers.length; i++){
jpayne@68 48 StatsContainer sc=rnaContainers[i];
jpayne@68 49 sc.setInner(kInnerRNA, 1);
jpayne@68 50 }
jpayne@68 51
jpayne@68 52 statstRNA.setStart(kStartRNA, 14, 4);
jpayne@68 53 statstRNA.setStop(kStopRNA, 14, 6);
jpayne@68 54
jpayne@68 55 stats16S.setStart(kStartRNA, 20, 7);
jpayne@68 56 stats16S.setStop(kStopRNA, 12, 16);
jpayne@68 57
jpayne@68 58 stats23S.setStart(kStartRNA, 17, 3);
jpayne@68 59 stats23S.setStop(kStopRNA, 15, 12);
jpayne@68 60
jpayne@68 61 stats5S.setStart(kStartRNA, 20, 5);
jpayne@68 62 stats5S.setStop(kStopRNA, 15, 5);
jpayne@68 63
jpayne@68 64 stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined
jpayne@68 65 stats18S.setStop(kStopRNA, 12, 16);
jpayne@68 66 }
jpayne@68 67
jpayne@68 68 /*--------------------------------------------------------------*/
jpayne@68 69 /*---------------- Public Methods ----------------*/
jpayne@68 70 /*--------------------------------------------------------------*/
jpayne@68 71
jpayne@68 72 public boolean process(String genomeFname, String gffFname){
jpayne@68 73 // fnames.add(ReadWrite.stripPath(genomeFname));
jpayne@68 74 numFiles++;
jpayne@68 75 FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true);
jpayne@68 76 FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true);
jpayne@68 77
jpayne@68 78 if(fnaFile==null || gffFile==null){
jpayne@68 79 errorState=true;
jpayne@68 80 return true;
jpayne@68 81 }
jpayne@68 82 filesProcessed++;
jpayne@68 83
jpayne@68 84 ArrayList<ScafData> scafList;
jpayne@68 85 {//Scoped to save memory
jpayne@68 86 ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads);
jpayne@68 87 readsProcessed+=reads.size();
jpayne@68 88 scafList=new ArrayList<ScafData>(reads.size());
jpayne@68 89 for(Read r : reads){
jpayne@68 90 basesProcessed+=r.length();
jpayne@68 91 scafList.add(new ScafData(r));
jpayne@68 92 }
jpayne@68 93 }
jpayne@68 94 {//Scoped to save memory
jpayne@68 95 ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true);
jpayne@68 96 ArrayList<GffLine> cds=allGffLines[0];
jpayne@68 97 ArrayList<GffLine> rrna=allGffLines[1];
jpayne@68 98 ArrayList<GffLine> trna=allGffLines[2];
jpayne@68 99 genesProcessed+=cds.size();
jpayne@68 100 genesProcessed+=(rrna==null ? 0 : rrna.size());
jpayne@68 101 genesProcessed+=(trna==null ? 0 : trna.size());
jpayne@68 102
jpayne@68 103 HashMap<String, ScafData> scafMap=makeScafMap(scafList);
jpayne@68 104 fillScafDataCDS(cds, scafMap);
jpayne@68 105 fillScafDataRNA(rrna, scafMap);
jpayne@68 106 fillScafDataRNA(trna, scafMap);
jpayne@68 107 }
jpayne@68 108
jpayne@68 109 countBases(scafList);
jpayne@68 110 if(PROCESS_PLUS_STRAND){
jpayne@68 111 processStrand(scafList, Shared.PLUS);
jpayne@68 112 }
jpayne@68 113 if(PROCESS_MINUS_STRAND){
jpayne@68 114 for(ScafData sd : scafList){
jpayne@68 115 sd.clear();
jpayne@68 116 sd.reverseComplement();
jpayne@68 117 }
jpayne@68 118 processStrand(scafList, Shared.MINUS);
jpayne@68 119 for(ScafData sd : scafList){
jpayne@68 120 sd.clear();
jpayne@68 121 sd.reverseComplement();
jpayne@68 122 }
jpayne@68 123 }
jpayne@68 124 return false;
jpayne@68 125 }
jpayne@68 126
jpayne@68 127 public void add(GeneModel pgm){
jpayne@68 128 for(int i=0; i<allContainers.length; i++){
jpayne@68 129 // System.err.println("merging "+allContainers[i].name);
jpayne@68 130 allContainers[i].add(pgm.allContainers[i]);
jpayne@68 131 }
jpayne@68 132
jpayne@68 133 readsProcessed+=pgm.readsProcessed;
jpayne@68 134 basesProcessed+=pgm.basesProcessed;
jpayne@68 135 genesProcessed+=pgm.genesProcessed;
jpayne@68 136 filesProcessed+=pgm.filesProcessed;
jpayne@68 137
jpayne@68 138 // geneStartsProcessed+=pgm.geneStartsProcessed;
jpayne@68 139 // tRNAProcessed+=pgm.tRNAProcessed;
jpayne@68 140 // r16SProcessed+=pgm.r16SProcessed;
jpayne@68 141 // r23SProcessed+=pgm.r23SProcessed;
jpayne@68 142 // r5SProcessed+=pgm.r5SProcessed;
jpayne@68 143 // r18SProcessed+=pgm.r18SProcessed;
jpayne@68 144
jpayne@68 145 // fnames.addAll(pgm.fnames);
jpayne@68 146 numFiles+=pgm.numFiles;
jpayne@68 147 taxIds.addAll(pgm.taxIds);
jpayne@68 148 Tools.add(baseCounts, pgm.baseCounts);
jpayne@68 149 }
jpayne@68 150
jpayne@68 151 public void multiplyBy(double mult) {
jpayne@68 152 for(int i=0; i<allContainers.length; i++){
jpayne@68 153 allContainers[i].multiplyBy(mult);
jpayne@68 154 }
jpayne@68 155
jpayne@68 156 readsProcessed=Math.round(readsProcessed*mult);
jpayne@68 157 basesProcessed=Math.round(basesProcessed*mult);
jpayne@68 158 genesProcessed=Math.round(genesProcessed*mult);
jpayne@68 159 filesProcessed=Math.round(filesProcessed*mult);
jpayne@68 160
jpayne@68 161 for(int i=0; i<baseCounts.length; i++){
jpayne@68 162 baseCounts[i]=Math.round(baseCounts[i]*mult);
jpayne@68 163 }
jpayne@68 164 }
jpayne@68 165
jpayne@68 166 /*--------------------------------------------------------------*/
jpayne@68 167 /*---------------- Outer Methods ----------------*/
jpayne@68 168 /*--------------------------------------------------------------*/
jpayne@68 169
jpayne@68 170 public float gc(){
jpayne@68 171 long a=baseCounts[0];
jpayne@68 172 long c=baseCounts[1];
jpayne@68 173 long g=baseCounts[2];
jpayne@68 174 long t=baseCounts[3];
jpayne@68 175 return (float)((g+c)/Tools.max(1.0, a+t+g+c));
jpayne@68 176 }
jpayne@68 177
jpayne@68 178 HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){
jpayne@68 179 HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3);
jpayne@68 180 for(ScafData sd : scafList){scafMap.put(sd.name, sd);}
jpayne@68 181 for(ScafData sd : scafList){
jpayne@68 182 String name=sd.name;
jpayne@68 183 int idx=name.indexOf(' ');
jpayne@68 184 if(idx>=0){
jpayne@68 185 String prefix=name.substring(0, idx);
jpayne@68 186 if(scafMap.containsKey(prefix)){
jpayne@68 187 assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'";
jpayne@68 188 }else{
jpayne@68 189 scafMap.put(prefix, sd);
jpayne@68 190 }
jpayne@68 191 }
jpayne@68 192 }
jpayne@68 193 return scafMap;
jpayne@68 194 }
jpayne@68 195
jpayne@68 196 public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){
jpayne@68 197 if(!callCDS){return;}
jpayne@68 198 for(GffLine gline : cdsLines){
jpayne@68 199 ScafData sd=scafMap.get(gline.seqid);
jpayne@68 200 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
jpayne@68 201 sd.addCDS(gline);
jpayne@68 202 }
jpayne@68 203 }
jpayne@68 204
jpayne@68 205 public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){
jpayne@68 206 for(GffLine gline : rnaLines){
jpayne@68 207 ScafData sd=scafMap.get(gline.seqid);
jpayne@68 208 assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid;
jpayne@68 209 if(processType(gline.prokType())){
jpayne@68 210 sd.addRNA(gline);
jpayne@68 211 }
jpayne@68 212 }
jpayne@68 213 }
jpayne@68 214
jpayne@68 215 public void processStrand(ArrayList<ScafData> scafList, int strand){
jpayne@68 216 for(ScafData sd : scafList){
jpayne@68 217 processCDS(sd, strand);
jpayne@68 218 processRNA(sd, strand);
jpayne@68 219 }
jpayne@68 220 }
jpayne@68 221
jpayne@68 222 /*--------------------------------------------------------------*/
jpayne@68 223 /*---------------- Inner Methods ----------------*/
jpayne@68 224 /*--------------------------------------------------------------*/
jpayne@68 225
jpayne@68 226 private void countBases(ArrayList<ScafData> scafList){
jpayne@68 227 for(ScafData sd : scafList){
jpayne@68 228 countBases(sd.bases);
jpayne@68 229 }
jpayne@68 230 }
jpayne@68 231
jpayne@68 232 private void countBases(byte[] bases){
jpayne@68 233 for(byte b : bases){
jpayne@68 234 int x=AminoAcid.baseToNumberACGTother[b];
jpayne@68 235 baseCounts[x]++;
jpayne@68 236 }
jpayne@68 237 }
jpayne@68 238
jpayne@68 239 /*--------------------------------------------------------------*/
jpayne@68 240 /*---------------- Finding Codons ----------------*/
jpayne@68 241 /*--------------------------------------------------------------*/
jpayne@68 242
jpayne@68 243 private static void findStopCodons(byte[] bases, IntList list, BitSet valid){
jpayne@68 244 final int k=3;
jpayne@68 245 final int mask=~((-1)<<(2*k));
jpayne@68 246 int kmer=0;
jpayne@68 247 int len=0;
jpayne@68 248
jpayne@68 249 for(int i=0; i<bases.length; i++){
jpayne@68 250 byte b=bases[i];
jpayne@68 251 int x=AminoAcid.baseToNumber[b];
jpayne@68 252 kmer=((kmer<<2)|x)&mask;
jpayne@68 253 if(x>=0){
jpayne@68 254 len++;
jpayne@68 255 if(len>=k){
jpayne@68 256 int point=i;//End of the stop codon
jpayne@68 257 if(isStopCodon(kmer) && !valid.get(point)){
jpayne@68 258 list.add(point);
jpayne@68 259 valid.set(point);
jpayne@68 260 }
jpayne@68 261 }
jpayne@68 262 }else{len=0;}
jpayne@68 263 }
jpayne@68 264
jpayne@68 265 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
jpayne@68 266 if(!valid.get(i)){
jpayne@68 267 list.add(i);
jpayne@68 268 }
jpayne@68 269 }
jpayne@68 270 }
jpayne@68 271
jpayne@68 272 private static void findStartCodons(byte[] bases, IntList list, BitSet valid){
jpayne@68 273 final int k=3;
jpayne@68 274 final int mask=~((-1)<<(2*k));
jpayne@68 275 int kmer=0;
jpayne@68 276 int len=0;
jpayne@68 277
jpayne@68 278 for(int i=0; i<bases.length; i++){
jpayne@68 279 byte b=bases[i];
jpayne@68 280 int x=AminoAcid.baseToNumber[b];
jpayne@68 281 kmer=((kmer<<2)|x)&mask;
jpayne@68 282 if(x>=0){
jpayne@68 283 len++;
jpayne@68 284 if(len>=k){
jpayne@68 285 int point=i-k+1;//Start of the start codon
jpayne@68 286 if(isStartCodon(kmer) && !valid.get(point)){
jpayne@68 287 list.add(point);
jpayne@68 288 valid.set(point);
jpayne@68 289 }
jpayne@68 290 }
jpayne@68 291 }else{len=0;}
jpayne@68 292 }
jpayne@68 293
jpayne@68 294 for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise
jpayne@68 295 if(!valid.get(i)){
jpayne@68 296 list.add(i);
jpayne@68 297 }
jpayne@68 298 }
jpayne@68 299 }
jpayne@68 300
jpayne@68 301 /*--------------------------------------------------------------*/
jpayne@68 302 /*---------------- Processing GffLines ----------------*/
jpayne@68 303 /*--------------------------------------------------------------*/
jpayne@68 304
jpayne@68 305 private static void processGene(GffLine gline, ScafData sd){
jpayne@68 306 if(gline.length()<2){return;}
jpayne@68 307 final int strand=gline.strand;
jpayne@68 308 assert(strand==sd.strand());
jpayne@68 309 final byte[] frames=sd.frames;
jpayne@68 310 int start=gline.start-1, stop=gline.stop-1;
jpayne@68 311 if(start<0 || stop>=sd.length()){return;}
jpayne@68 312 // assert(start<stop) : gline; //Not always true for euks...
jpayne@68 313 if(strand==Shared.MINUS){
jpayne@68 314 int x=sd.length()-start-1;
jpayne@68 315 int y=sd.length()-stop-1;
jpayne@68 316 start=y;
jpayne@68 317 stop=x;
jpayne@68 318
jpayne@68 319 // String a=new String(sd.bases, start, 3);
jpayne@68 320 // String b=new String(sd.bases, stop-2, 3);
jpayne@68 321 //// assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3);
jpayne@68 322 // outstream.println(a+", "+b+", "+start+", "+stop);
jpayne@68 323 }
jpayne@68 324 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
jpayne@68 325 markFrames(start, stop, frames, kInnerCDS);
jpayne@68 326 sd.starts.add(start);
jpayne@68 327 sd.stops.add(stop);
jpayne@68 328 // assert(gline.start!=337) : gline+"\n"+start+", "+stop;
jpayne@68 329 }
jpayne@68 330
jpayne@68 331 private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){
jpayne@68 332 final int strand=gline.strand;
jpayne@68 333 assert(strand==sd.strand());
jpayne@68 334 final byte[] frames=sd.frames;
jpayne@68 335 int start=gline.start-1, stop=gline.stop-1;
jpayne@68 336 if(start<0 || stop>=sd.length()){return false;}
jpayne@68 337 assert(start<=stop);
jpayne@68 338 if(strand==Shared.MINUS){
jpayne@68 339 int x=sd.length()-start-1;
jpayne@68 340 int y=sd.length()-stop-1;
jpayne@68 341 start=y;
jpayne@68 342 stop=x;
jpayne@68 343 }
jpayne@68 344
jpayne@68 345 if(AnalyzeGenes.alignRibo){
jpayne@68 346 // byte[] seq=sd.fetch(start, stop);
jpayne@68 347 Read[] consensusReads=ProkObject.consensusReads(type);
jpayne@68 348 byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null);
jpayne@68 349 float minIdentity=ProkObject.minID(type);
jpayne@68 350 if(universal!=null){
jpayne@68 351 int[] coords=KillSwitch.allocInt1D(2);
jpayne@68 352 final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50));
jpayne@68 353 final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50));
jpayne@68 354 float id1=align(universal, sd.bases, a, b, minIdentity, coords);
jpayne@68 355 final int rstart=coords[0], rstop=coords[1];
jpayne@68 356 // assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
jpayne@68 357 if(id1<minIdentity){
jpayne@68 358 // System.err.println("Low identity: "+String.format("%.2s", 100*id1));
jpayne@68 359 return false;
jpayne@68 360 }else{
jpayne@68 361 // System.err.println("Good identity: "+String.format("%.2s", 100*id1));
jpayne@68 362 }
jpayne@68 363 if(AnalyzeGenes.adjustEndpoints){
jpayne@68 364 int startSlop=startSlop(type);
jpayne@68 365 int stopSlop=stopSlop(type);
jpayne@68 366 if(Tools.absdif(start, rstart)>startSlop){
jpayne@68 367 // System.err.println("rstart:\t"+start+" -> "+rstart);
jpayne@68 368 start=rstart;
jpayne@68 369 }
jpayne@68 370 if(Tools.absdif(stop, rstop)>stopSlop){
jpayne@68 371 // System.err.println("rstop: \t"+stop+" -> "+rstop);
jpayne@68 372 stop=rstop;
jpayne@68 373 }
jpayne@68 374 }
jpayne@68 375 }
jpayne@68 376 }
jpayne@68 377
jpayne@68 378 StatsContainer sc=allContainers[type];
jpayne@68 379 sc.start.processPoint(sd.bases, start, 1);
jpayne@68 380 sc.stop.processPoint(sd.bases, stop, 1);
jpayne@68 381 assert(sc!=statsCDS);
jpayne@68 382
jpayne@68 383 assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name;
jpayne@68 384 final byte flag=typeToFlag(type);
jpayne@68 385 for(int i=start+kInnerRNA-1; i<=stop; i++){
jpayne@68 386 frames[i]|=flag;
jpayne@68 387 }
jpayne@68 388 return true;
jpayne@68 389 }
jpayne@68 390
jpayne@68 391 private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){
jpayne@68 392 // final int a=0, b=ref.length-1;
jpayne@68 393 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA();
jpayne@68 394 final int minScore=ssa.minScoreByIdentity(query.length, minIdentity);
jpayne@68 395 int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore);
jpayne@68 396 if(max==null){return 0;}
jpayne@68 397
jpayne@68 398 final int rows=max[0];
jpayne@68 399 final int maxCol=max[1];
jpayne@68 400 final int maxState=max[2];
jpayne@68 401 // final int maxScore=max[3];
jpayne@68 402 // final int maxStart=max[4];
jpayne@68 403
jpayne@68 404 //returns {score, bestRefStart, bestRefStop}
jpayne@68 405 //padded: {score, bestRefStart, bestRefStop, padLeft, padRight};
jpayne@68 406 final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState);
jpayne@68 407 final int rstart=Tools.max(score[1], 0);
jpayne@68 408 final int rstop=Tools.min(score[2], ref.length-1);
jpayne@68 409 if(coords!=null){
jpayne@68 410 coords[0]=rstart;
jpayne@68 411 coords[1]=rstop;
jpayne@68 412 }
jpayne@68 413 final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null);
jpayne@68 414 return id;
jpayne@68 415 }
jpayne@68 416
jpayne@68 417 /**
jpayne@68 418 * Each frame byte has a bit marked for valid coding frames.
jpayne@68 419 * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon.
jpayne@68 420 * If frames[23]=0, then no coding kmer end at that location on this strand.
jpayne@68 421 * @param start
jpayne@68 422 * @param stop
jpayne@68 423 * @param frames
jpayne@68 424 * @param k
jpayne@68 425 */
jpayne@68 426 private static void markFrames(int start, int stop, byte[] frames, int k){
jpayne@68 427 assert(start<=stop) : start+", "+stop;
jpayne@68 428 for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){
jpayne@68 429 frames[i]=(byte)(frames[i]|frameBit);
jpayne@68 430 frameBit<<=1;
jpayne@68 431 if(frameBit>4){frameBit=1;}
jpayne@68 432 }
jpayne@68 433 // assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct
jpayne@68 434 }
jpayne@68 435
jpayne@68 436 /*--------------------------------------------------------------*/
jpayne@68 437 /*---------------- Counting Kmers ----------------*/
jpayne@68 438 /*--------------------------------------------------------------*/
jpayne@68 439
jpayne@68 440 private void processCDS(ScafData sd, int strand){
jpayne@68 441 if(!callCDS){return;}
jpayne@68 442 ArrayList<GffLine> glines=sd.cdsLines[strand];
jpayne@68 443 for(GffLine gline : glines){
jpayne@68 444 assert(gline.strand==strand);
jpayne@68 445 processGene(gline, sd);
jpayne@68 446 statsCDS.addLength(gline.length());
jpayne@68 447 }
jpayne@68 448
jpayne@68 449 statsCDS.inner.processCDSFrames(sd.bases, sd.frames);
jpayne@68 450 BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1);
jpayne@68 451 BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1);
jpayne@68 452 // outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops.");
jpayne@68 453 sd.clear();
jpayne@68 454 findStartCodons(sd.bases, sd.starts, startSet);
jpayne@68 455 findStopCodons(sd.bases, sd.stops, stopSet);
jpayne@68 456 // outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops.");
jpayne@68 457 processEnds(sd.bases, statsCDS.start, sd.starts, 0);
jpayne@68 458 processEnds(sd.bases, statsCDS.stop, sd.stops, 0);
jpayne@68 459 }
jpayne@68 460
jpayne@68 461 private static int getGlineType(GffLine gline, ScafData sd){
jpayne@68 462 if(!gline.inbounds(sd.length()) || gline.partial()){return -1;}
jpayne@68 463
jpayne@68 464 final int length=gline.length();
jpayne@68 465 final int type=gline.prokType();
jpayne@68 466 if(type<0){
jpayne@68 467 return type;
jpayne@68 468 }else if(type==CDS){
jpayne@68 469 return type;
jpayne@68 470 }else if(type==tRNA && length>=40 && length<=120){
jpayne@68 471 return type;
jpayne@68 472 }else if(type==r16S && length>=1440 && length<=1620){
jpayne@68 473 return type;
jpayne@68 474 }else if(type==r23S && length>=2720 && length<=3170){
jpayne@68 475 return type;
jpayne@68 476 }else if(type==r5S && length>=90 && length<=150){
jpayne@68 477 return type;
jpayne@68 478 }else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range
jpayne@68 479 return type;
jpayne@68 480 }
jpayne@68 481 return -1;
jpayne@68 482 }
jpayne@68 483
jpayne@68 484 private void processRNA(ScafData sd, int strand){
jpayne@68 485 sd.clear();
jpayne@68 486 ArrayList<GffLine> lines=sd.rnaLines[strand];
jpayne@68 487 for(GffLine gline : lines){
jpayne@68 488 assert(gline.strand==strand);
jpayne@68 489 final int type=getGlineType(gline, sd);
jpayne@68 490 if(type>0){
jpayne@68 491 StatsContainer sc=allContainers[type];
jpayne@68 492 sc.addLength(gline.length());
jpayne@68 493 processRnaLine(gline, sd, type);
jpayne@68 494 }
jpayne@68 495 }
jpayne@68 496 processRnaInner(sd);
jpayne@68 497 processRnaEnds(sd);
jpayne@68 498 }
jpayne@68 499
jpayne@68 500 void processRnaInner(ScafData sd){
jpayne@68 501 byte[] bases=sd.bases;
jpayne@68 502 byte[] frames=sd.frames;
jpayne@68 503 final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs.
jpayne@68 504 final int mask=~((-1)<<(2*k));
jpayne@68 505 int kmer=0;
jpayne@68 506 int len=0;
jpayne@68 507
jpayne@68 508 for(int i=0; i<bases.length; i++){
jpayne@68 509 byte b=bases[i];
jpayne@68 510 int x=AminoAcid.baseToNumber[b];
jpayne@68 511 kmer=((kmer<<2)|x)&mask;
jpayne@68 512 if(x>=0){
jpayne@68 513 len++;
jpayne@68 514 if(len>=k){
jpayne@68 515 int vf=frames[i];
jpayne@68 516 for(int type=0; type<5; type++){
jpayne@68 517 int valid=vf&1;
jpayne@68 518 rnaContainers[type].inner.add(kmer, 0, valid);
jpayne@68 519 vf=(vf>>1);
jpayne@68 520 }
jpayne@68 521 }
jpayne@68 522 }else{len=0;}
jpayne@68 523 }
jpayne@68 524 }
jpayne@68 525
jpayne@68 526 void processRnaEnds(ScafData sd){
jpayne@68 527 byte[] bases=sd.bases;
jpayne@68 528
jpayne@68 529 final int k=stats16S.start.k;
jpayne@68 530 final int kMax=stats16S.start.kMax;
jpayne@68 531 final int mask=stats16S.start.mask;
jpayne@68 532 final long[] counts=new long[kMax];//TODO: Slow
jpayne@68 533
jpayne@68 534 int kmer=0;
jpayne@68 535 int len=0;
jpayne@68 536
jpayne@68 537 for(int i=0; i<bases.length; i++){
jpayne@68 538 byte b=bases[i];
jpayne@68 539 int x=AminoAcid.baseToNumber[b];
jpayne@68 540 kmer=((kmer<<2)|x)&mask;
jpayne@68 541
jpayne@68 542 if(x>=0){
jpayne@68 543 len++;
jpayne@68 544 if(len>=k){
jpayne@68 545 counts[kmer]++;
jpayne@68 546 }
jpayne@68 547 }else{len=0;}
jpayne@68 548 }
jpayne@68 549 for(StatsContainer sc : rnaContainers){
jpayne@68 550 FrameStats fs=sc.start;
jpayne@68 551 for(long[] array : fs.countsFalse){
jpayne@68 552 Tools.add(array, counts);
jpayne@68 553 }
jpayne@68 554 fs=sc.stop;
jpayne@68 555 for(long[] array : fs.countsFalse){
jpayne@68 556 Tools.add(array, counts);
jpayne@68 557 }
jpayne@68 558 }
jpayne@68 559 }
jpayne@68 560
jpayne@68 561 private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){
jpayne@68 562 BitSet points=new BitSet(bases.length);
jpayne@68 563 for(int i=0; i<list.size; i++){
jpayne@68 564 int point=list.get(i);
jpayne@68 565 stats.processPoint(bases, list.get(i), valid);
jpayne@68 566 points.set(point);
jpayne@68 567 }
jpayne@68 568 return points;
jpayne@68 569 }
jpayne@68 570
jpayne@68 571 /*--------------------------------------------------------------*/
jpayne@68 572 /*---------------- Scoring ----------------*/
jpayne@68 573 /*--------------------------------------------------------------*/
jpayne@68 574
jpayne@68 575 // //Assumes bases are in the correct strand
jpayne@68 576 // public float calcStartScore(int start, int stop, byte[] bases){
jpayne@68 577 // float f=scorePoint(start, bases, startStats);
jpayne@68 578 //// float ss=scoreStart2(start, bases, stop, innerKmerStats);
jpayne@68 579 //// if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study.
jpayne@68 580 // return f;
jpayne@68 581 // }
jpayne@68 582 //
jpayne@68 583 // //Assumes bases are in the correct strand
jpayne@68 584 // public float calcStopScore(int stop, byte[] bases){
jpayne@68 585 // float f=scorePoint(stop, bases, stopStats);
jpayne@68 586 // return f;
jpayne@68 587 // }
jpayne@68 588 //
jpayne@68 589 // //Assumes bases are in the correct strand
jpayne@68 590 // public float calcRnaStartScore(int start, int stop, byte[] bases){
jpayne@68 591 // float f=scorePoint(start, bases, rrnaStartStats);
jpayne@68 592 // return f;
jpayne@68 593 // }
jpayne@68 594 //
jpayne@68 595 // //Assumes bases are in the correct strand
jpayne@68 596 // public float calcRnaStopScore(int stop, byte[] bases){
jpayne@68 597 // float f=scorePoint(stop, bases, rrnaStopStats);
jpayne@68 598 // return f;
jpayne@68 599 // }
jpayne@68 600
jpayne@68 601 // public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){
jpayne@68 602 //
jpayne@68 603 // assert(stats.frames==3);
jpayne@68 604 // final int k=stats.k;
jpayne@68 605 // final int mask=~((-1)<<(2*k));
jpayne@68 606 //
jpayne@68 607 // int kmer=0;
jpayne@68 608 // int len=0;
jpayne@68 609 // float score=0;
jpayne@68 610 // int numKmers=0;
jpayne@68 611 //
jpayne@68 612 // for(int pos=start, currentFrame=startFrame; pos<stop; pos++){
jpayne@68 613 // final byte b=bases[pos];
jpayne@68 614 // final int x=AminoAcid.baseToNumber[b];
jpayne@68 615 //
jpayne@68 616 // if(x>=0){
jpayne@68 617 // kmer=((kmer<<2)|x)&mask;
jpayne@68 618 // len++;
jpayne@68 619 // if(len>=k){
jpayne@68 620 // float prob=stats.probs[currentFrame][kmer];
jpayne@68 621 // float dif=prob-0.99f;//Prob above 1 is more likely than average
jpayne@68 622 // score+=dif;
jpayne@68 623 // numKmers++;
jpayne@68 624 // }
jpayne@68 625 // }else{
jpayne@68 626 // len=0;
jpayne@68 627 // kmer=0;
jpayne@68 628 // }
jpayne@68 629 //
jpayne@68 630 // currentFrame++;
jpayne@68 631 // if(currentFrame>2){currentFrame=0;}
jpayne@68 632 // }
jpayne@68 633 // return score/Tools.max(1f, numKmers);
jpayne@68 634 // }
jpayne@68 635 //
jpayne@68 636 // /**
jpayne@68 637 // * TODO
jpayne@68 638 // * Evaluate the relative difference between left and right frequencies.
jpayne@68 639 // * The purpose is to find locations where the left side looks noncoding and the right side looks coding.
jpayne@68 640 // * Does not currently yield useful results.
jpayne@68 641 // */
jpayne@68 642 // public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){
jpayne@68 643 // final int k=stats.k;
jpayne@68 644 //
jpayne@68 645 // int start=point-45;
jpayne@68 646 // if(start<0 || stop>bases.length){return 0.5f;}
jpayne@68 647 //
jpayne@68 648 // float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats);
jpayne@68 649 // float right=calcKmerScore(point, stop-3, 0, bases, stats);
jpayne@68 650 // return right-left; //High numbers are likely to be starts; non-starts should be near 0.
jpayne@68 651 // }
jpayne@68 652
jpayne@68 653 /*--------------------------------------------------------------*/
jpayne@68 654 /*---------------- toString ----------------*/
jpayne@68 655 /*--------------------------------------------------------------*/
jpayne@68 656
jpayne@68 657 @Override
jpayne@68 658 public String toString(){
jpayne@68 659 return appendTo(new ByteBuilder()).toString();
jpayne@68 660 }
jpayne@68 661
jpayne@68 662 public ByteBuilder appendTo(ByteBuilder bb){
jpayne@68 663
jpayne@68 664 // Collections.sort(fnames);
jpayne@68 665 taxIds.sort();
jpayne@68 666
jpayne@68 667 bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n");
jpayne@68 668 bb.append("#files");
jpayne@68 669 bb.tab().append(numFiles);
jpayne@68 670 // if(fnames.size()>5){
jpayne@68 671 // bb.tab().append(fnames.size());
jpayne@68 672 // }else{
jpayne@68 673 // for(String fname : fnames){
jpayne@68 674 // bb.tab().append(fname);
jpayne@68 675 // }
jpayne@68 676 // }
jpayne@68 677 bb.nl();
jpayne@68 678 bb.append("#taxIDs");
jpayne@68 679 for(int i=0; i<taxIds.size; i++){
jpayne@68 680 bb.tab().append(taxIds.get(i));
jpayne@68 681 }
jpayne@68 682 bb.nl();
jpayne@68 683 // bb.append("#k_inner\t").append(innerKmerLength).nl();
jpayne@68 684 // bb.append("#k_end\t").append(endKmerLength).nl();
jpayne@68 685 // bb.append("#start_left_offset\t").append(startLeftOffset).nl();
jpayne@68 686 // bb.append("#start_right_offset\t").append(startRightOffset).nl();
jpayne@68 687 // bb.append("#stop_left_offset\t").append(stopLeftOffset).nl();
jpayne@68 688 // bb.append("#stop_right_offset\t").append(stopRightOffset).nl();
jpayne@68 689 bb.append("#scaffolds\t").append(readsProcessed).nl();
jpayne@68 690 bb.append("#bases\t").append(basesProcessed).nl();
jpayne@68 691 bb.append("#genes\t").append(genesProcessed).nl();
jpayne@68 692 bb.append("#GC\t").append(gc(),2).nl();
jpayne@68 693 bb.append("#ACGTN");
jpayne@68 694 for(long x : baseCounts){
jpayne@68 695 bb.tab().append(x);
jpayne@68 696 }
jpayne@68 697 bb.nl();
jpayne@68 698
jpayne@68 699 for(StatsContainer sc : allContainers){
jpayne@68 700 sc.appendTo(bb);
jpayne@68 701 }
jpayne@68 702 assert(allContainers.length>5) : allContainers.length;
jpayne@68 703
jpayne@68 704 return bb;
jpayne@68 705 }
jpayne@68 706
jpayne@68 707 /*--------------------------------------------------------------*/
jpayne@68 708 /*---------------- Stats ----------------*/
jpayne@68 709 /*--------------------------------------------------------------*/
jpayne@68 710
jpayne@68 711 public final StatsContainer statsCDS=new StatsContainer(CDS);
jpayne@68 712 public final StatsContainer statstRNA=new StatsContainer(tRNA);
jpayne@68 713 public final StatsContainer stats16S=new StatsContainer(r16S);
jpayne@68 714 public final StatsContainer stats23S=new StatsContainer(r23S);
jpayne@68 715 public final StatsContainer stats5S=new StatsContainer(r5S);
jpayne@68 716 public final StatsContainer stats18S=new StatsContainer(r18S);
jpayne@68 717
jpayne@68 718 final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S};
jpayne@68 719 final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S};
jpayne@68 720 //public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
jpayne@68 721
jpayne@68 722 // public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0);
jpayne@68 723 // public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset);
jpayne@68 724 // public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset);
jpayne@68 725 //
jpayne@68 726 // public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8);
jpayne@68 727 // public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8);
jpayne@68 728 //
jpayne@68 729 // public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0);
jpayne@68 730 // public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0);
jpayne@68 731 // public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0);
jpayne@68 732 // public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0);
jpayne@68 733 // public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats};
jpayne@68 734
jpayne@68 735 /*--------------------------------------------------------------*/
jpayne@68 736
jpayne@68 737 // public ArrayList<String> fnames=new ArrayList<String>();
jpayne@68 738 public int numFiles=0;
jpayne@68 739 public IntList taxIds=new IntList();
jpayne@68 740
jpayne@68 741
jpayne@68 742 /*--------------------------------------------------------------*/
jpayne@68 743 /*---------------- Fields ----------------*/
jpayne@68 744 /*--------------------------------------------------------------*/
jpayne@68 745
jpayne@68 746 private long maxReads=-1;
jpayne@68 747
jpayne@68 748 long readsProcessed=0;
jpayne@68 749 long basesProcessed=0;
jpayne@68 750 long genesProcessed=0;
jpayne@68 751 long filesProcessed=0;
jpayne@68 752 long[] baseCounts=new long[5];
jpayne@68 753
jpayne@68 754 /*--------------------------------------------------------------*/
jpayne@68 755 /*---------------- Static Setters ----------------*/
jpayne@68 756 /*--------------------------------------------------------------*/
jpayne@68 757
jpayne@68 758 public synchronized void setStatics(){
jpayne@68 759 // assert(!setStatics);
jpayne@68 760 kInnerCDS=statsCDS.inner.k;
jpayne@68 761 kStartCDS=statsCDS.start.k;
jpayne@68 762 kStopCDS=statsCDS.stop.k;
jpayne@68 763
jpayne@68 764 setStartLeftOffset(statsCDS.start.leftOffset);
jpayne@68 765 setStartRightOffset(statsCDS.start.rightOffset());
jpayne@68 766
jpayne@68 767 setStopLeftOffset(statsCDS.stop.leftOffset);
jpayne@68 768 setStopRightOffset(statsCDS.stop.rightOffset());
jpayne@68 769
jpayne@68 770 kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here?
jpayne@68 771 kStartRNA=stats16S.start.k;
jpayne@68 772 kStopRNA=stats16S.stop.k;
jpayne@68 773 setStatics=true;
jpayne@68 774 }
jpayne@68 775
jpayne@68 776 public static void setInnerK(int k){
jpayne@68 777 kInnerCDS=k;
jpayne@68 778 }
jpayne@68 779
jpayne@68 780 public static void setStartK(int k){
jpayne@68 781 kStartCDS=k;
jpayne@68 782 }
jpayne@68 783
jpayne@68 784 public static void setStopK(int k){
jpayne@68 785 kStopCDS=k;
jpayne@68 786 }
jpayne@68 787
jpayne@68 788 public static void setStartLeftOffset(int x){
jpayne@68 789 startLeftOffset=x;
jpayne@68 790 startFrames=startLeftOffset+startRightOffset+1;
jpayne@68 791 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
jpayne@68 792 }
jpayne@68 793
jpayne@68 794 public static void setStartRightOffset(int x){
jpayne@68 795 startRightOffset=x;
jpayne@68 796 startFrames=startLeftOffset+startRightOffset+1;
jpayne@68 797 // System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames);
jpayne@68 798 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
jpayne@68 799 }
jpayne@68 800
jpayne@68 801 public static void setStopLeftOffset(int x){
jpayne@68 802 stopLeftOffset=x;
jpayne@68 803 stopFrames=stopLeftOffset+stopRightOffset+1;
jpayne@68 804 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
jpayne@68 805 }
jpayne@68 806
jpayne@68 807 public static void setStopRightOffset(int x){
jpayne@68 808 stopRightOffset=x;
jpayne@68 809 stopFrames=stopLeftOffset+stopRightOffset+1;
jpayne@68 810 // System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames);
jpayne@68 811 // assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames;
jpayne@68 812 }
jpayne@68 813
jpayne@68 814 public static final boolean isStartCodon(int code){
jpayne@68 815 return code>=0 && code<=63 && isStartCodon[code];
jpayne@68 816 }
jpayne@68 817 public static final boolean isStopCodon(int code){
jpayne@68 818 return code>=0 && code<=63 && isStopCodon[code];
jpayne@68 819 }
jpayne@68 820
jpayne@68 821 /*--------------------------------------------------------------*/
jpayne@68 822 /*---------------- Class Init ----------------*/
jpayne@68 823 /*--------------------------------------------------------------*/
jpayne@68 824
jpayne@68 825 private static boolean[] makeIsCodon(String[] codons){
jpayne@68 826 boolean[] array=new boolean[64];
jpayne@68 827 for(String s : codons){
jpayne@68 828 int x=AminoAcid.toNumber(s);
jpayne@68 829 array[x]=true;
jpayne@68 830 }
jpayne@68 831 return array;
jpayne@68 832 }
jpayne@68 833
jpayne@68 834 public static int kInnerCDS=6;
jpayne@68 835 public static int kStartCDS=3;
jpayne@68 836 public static int kStopCDS=3;
jpayne@68 837
jpayne@68 838 static int startLeftOffset(){return startLeftOffset;}
jpayne@68 839 static int startRightOffset(){return startRightOffset;}
jpayne@68 840 static int startFrames(){return startFrames;}
jpayne@68 841
jpayne@68 842 private static int startLeftOffset=21; //21 works well for k=4
jpayne@68 843 private static int startRightOffset=8; //10 works well for k=4
jpayne@68 844 private static int startFrames=startLeftOffset+startRightOffset+1;
jpayne@68 845
jpayne@68 846 private static int stopLeftOffset=9;
jpayne@68 847 private static int stopRightOffset=12;
jpayne@68 848 private static int stopFrames=stopLeftOffset+stopRightOffset+1;
jpayne@68 849
jpayne@68 850 private static boolean setStatics=false;
jpayne@68 851
jpayne@68 852 /*--------------------------------------------------------------*/
jpayne@68 853 /*---------------- More Statics ----------------*/
jpayne@68 854 /*--------------------------------------------------------------*/
jpayne@68 855
jpayne@68 856 //E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9]
jpayne@68 857 public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"};
jpayne@68 858 public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"};
jpayne@68 859 public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"};
jpayne@68 860 public static boolean[] isStartCodon=makeIsCodon(startCodons);
jpayne@68 861 public static boolean[] isStopCodon=makeIsCodon(stopCodons);
jpayne@68 862
jpayne@68 863 /*--------------------------------------------------------------*/
jpayne@68 864
jpayne@68 865 private static PrintStream outstream=System.err;
jpayne@68 866 public static boolean verbose=false;
jpayne@68 867 public static boolean errorState=false;
jpayne@68 868
jpayne@68 869 }