Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/GeneModel.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,869 @@ +package prok; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; + +import aligner.SingleStateAlignerFlat2; +import dna.AminoAcid; +import fileIO.FileFormat; +import gff.GffLine; +import shared.KillSwitch; +import shared.Shared; +import shared.Tools; +import stream.Read; +import stream.ReadInputStream; +import structures.ByteBuilder; +import structures.IntList; + +/** + * This class is designed to store kmer frequencies related to gene + * starts, stops, and interiors. It can be loaded from a pgm file. + * + * It's possible to use multiple GeneModels; for example, one for + * each of several GC ranges or clades. + * @author Brian Bushnell + * @date Sep 24, 2018 + * + */ +public class GeneModel extends ProkObject { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + public GeneModel(boolean fill){ + if(fill){ + fillContainers(); + } + } + + void fillContainers(){ + statsCDS.setInner(kInnerCDS, 3); + statsCDS.setStart(kStartCDS, startFrames, startLeftOffset); + statsCDS.setStop(kStopCDS, stopFrames, stopLeftOffset); + + for(int i=0; i<rnaContainers.length; i++){ + StatsContainer sc=rnaContainers[i]; + sc.setInner(kInnerRNA, 1); + } + + statstRNA.setStart(kStartRNA, 14, 4); + statstRNA.setStop(kStopRNA, 14, 6); + + stats16S.setStart(kStartRNA, 20, 7); + stats16S.setStop(kStopRNA, 12, 16); + + stats23S.setStart(kStartRNA, 17, 3); + stats23S.setStop(kStopRNA, 15, 12); + + stats5S.setStart(kStartRNA, 20, 5); + stats5S.setStop(kStopRNA, 15, 5); + + stats18S.setStart(kStartRNA, 20, 7);//TODO: 18S bounds are untested and should be empirically determined + stats18S.setStop(kStopRNA, 12, 16); + } + + /*--------------------------------------------------------------*/ + /*---------------- Public Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public boolean process(String genomeFname, String gffFname){ +// fnames.add(ReadWrite.stripPath(genomeFname)); + numFiles++; + FileFormat fnaFile=FileFormat.testInput(genomeFname, FileFormat.FA, null, true, true); + FileFormat gffFile=FileFormat.testInput(gffFname, FileFormat.GFF, null, true, true); + + if(fnaFile==null || gffFile==null){ + errorState=true; + return true; + } + filesProcessed++; + + ArrayList<ScafData> scafList; + {//Scoped to save memory + ArrayList<Read> reads=ReadInputStream.toReads(fnaFile, maxReads); + readsProcessed+=reads.size(); + scafList=new ArrayList<ScafData>(reads.size()); + for(Read r : reads){ + basesProcessed+=r.length(); + scafList.add(new ScafData(r)); + } + } + {//Scoped to save memory + ArrayList<GffLine>[] allGffLines=GffLine.loadGffFileByType(gffFile, "CDS,rRNA,tRNA", true); + ArrayList<GffLine> cds=allGffLines[0]; + ArrayList<GffLine> rrna=allGffLines[1]; + ArrayList<GffLine> trna=allGffLines[2]; + genesProcessed+=cds.size(); + genesProcessed+=(rrna==null ? 0 : rrna.size()); + genesProcessed+=(trna==null ? 0 : trna.size()); + + HashMap<String, ScafData> scafMap=makeScafMap(scafList); + fillScafDataCDS(cds, scafMap); + fillScafDataRNA(rrna, scafMap); + fillScafDataRNA(trna, scafMap); + } + + countBases(scafList); + if(PROCESS_PLUS_STRAND){ + processStrand(scafList, Shared.PLUS); + } + if(PROCESS_MINUS_STRAND){ + for(ScafData sd : scafList){ + sd.clear(); + sd.reverseComplement(); + } + processStrand(scafList, Shared.MINUS); + for(ScafData sd : scafList){ + sd.clear(); + sd.reverseComplement(); + } + } + return false; + } + + public void add(GeneModel pgm){ + for(int i=0; i<allContainers.length; i++){ +// System.err.println("merging "+allContainers[i].name); + allContainers[i].add(pgm.allContainers[i]); + } + + readsProcessed+=pgm.readsProcessed; + basesProcessed+=pgm.basesProcessed; + genesProcessed+=pgm.genesProcessed; + filesProcessed+=pgm.filesProcessed; + +// geneStartsProcessed+=pgm.geneStartsProcessed; +// tRNAProcessed+=pgm.tRNAProcessed; +// r16SProcessed+=pgm.r16SProcessed; +// r23SProcessed+=pgm.r23SProcessed; +// r5SProcessed+=pgm.r5SProcessed; +// r18SProcessed+=pgm.r18SProcessed; + +// fnames.addAll(pgm.fnames); + numFiles+=pgm.numFiles; + taxIds.addAll(pgm.taxIds); + Tools.add(baseCounts, pgm.baseCounts); + } + + public void multiplyBy(double mult) { + for(int i=0; i<allContainers.length; i++){ + allContainers[i].multiplyBy(mult); + } + + readsProcessed=Math.round(readsProcessed*mult); + basesProcessed=Math.round(basesProcessed*mult); + genesProcessed=Math.round(genesProcessed*mult); + filesProcessed=Math.round(filesProcessed*mult); + + for(int i=0; i<baseCounts.length; i++){ + baseCounts[i]=Math.round(baseCounts[i]*mult); + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public float gc(){ + long a=baseCounts[0]; + long c=baseCounts[1]; + long g=baseCounts[2]; + long t=baseCounts[3]; + return (float)((g+c)/Tools.max(1.0, a+t+g+c)); + } + + HashMap<String, ScafData> makeScafMap(ArrayList<ScafData> scafList){ + HashMap<String, ScafData> scafMap=new HashMap<String, ScafData>(scafList.size()*3); + for(ScafData sd : scafList){scafMap.put(sd.name, sd);} + for(ScafData sd : scafList){ + String name=sd.name; + int idx=name.indexOf(' '); + if(idx>=0){ + String prefix=name.substring(0, idx); + if(scafMap.containsKey(prefix)){ + assert(false) : "Duplicate degenerate name: '"+name+"', '"+prefix+"'"; + }else{ + scafMap.put(prefix, sd); + } + } + } + return scafMap; + } + + public void fillScafDataCDS(ArrayList<GffLine> cdsLines, HashMap<String, ScafData> scafMap){ + if(!callCDS){return;} + for(GffLine gline : cdsLines){ + ScafData sd=scafMap.get(gline.seqid); + assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid; + sd.addCDS(gline); + } + } + + public void fillScafDataRNA(ArrayList<GffLine> rnaLines, HashMap<String, ScafData> scafMap){ + for(GffLine gline : rnaLines){ + ScafData sd=scafMap.get(gline.seqid); + assert(sd!=null) : "Can't find scaffold for GffLine "+gline.seqid; + if(processType(gline.prokType())){ + sd.addRNA(gline); + } + } + } + + public void processStrand(ArrayList<ScafData> scafList, int strand){ + for(ScafData sd : scafList){ + processCDS(sd, strand); + processRNA(sd, strand); + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + private void countBases(ArrayList<ScafData> scafList){ + for(ScafData sd : scafList){ + countBases(sd.bases); + } + } + + private void countBases(byte[] bases){ + for(byte b : bases){ + int x=AminoAcid.baseToNumberACGTother[b]; + baseCounts[x]++; + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Finding Codons ----------------*/ + /*--------------------------------------------------------------*/ + + private static void findStopCodons(byte[] bases, IntList list, BitSet valid){ + final int k=3; + final int mask=~((-1)<<(2*k)); + int kmer=0; + int len=0; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + if(x>=0){ + len++; + if(len>=k){ + int point=i;//End of the stop codon + if(isStopCodon(kmer) && !valid.get(point)){ + list.add(point); + valid.set(point); + } + } + }else{len=0;} + } + + for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise + if(!valid.get(i)){ + list.add(i); + } + } + } + + private static void findStartCodons(byte[] bases, IntList list, BitSet valid){ + final int k=3; + final int mask=~((-1)<<(2*k)); + int kmer=0; + int len=0; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + if(x>=0){ + len++; + if(len>=k){ + int point=i-k+1;//Start of the start codon + if(isStartCodon(kmer) && !valid.get(point)){ + list.add(point); + valid.set(point); + } + } + }else{len=0;} + } + + for(int i=50; i<bases.length-3; i+=2000){//Add some non-canonical sites, aka noise + if(!valid.get(i)){ + list.add(i); + } + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Processing GffLines ----------------*/ + /*--------------------------------------------------------------*/ + + private static void processGene(GffLine gline, ScafData sd){ + if(gline.length()<2){return;} + final int strand=gline.strand; + assert(strand==sd.strand()); + final byte[] frames=sd.frames; + int start=gline.start-1, stop=gline.stop-1; + if(start<0 || stop>=sd.length()){return;} +// assert(start<stop) : gline; //Not always true for euks... + if(strand==Shared.MINUS){ + int x=sd.length()-start-1; + int y=sd.length()-stop-1; + start=y; + stop=x; + +// String a=new String(sd.bases, start, 3); +// String b=new String(sd.bases, stop-2, 3); +//// assert(false) : start+", "+stop+"\n"+gline+"\n"+new String(sd.bases, start, 3)+", "+new String(sd.bases, stop-2, 3); +// outstream.println(a+", "+b+", "+start+", "+stop); + } + assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name; + markFrames(start, stop, frames, kInnerCDS); + sd.starts.add(start); + sd.stops.add(stop); +// assert(gline.start!=337) : gline+"\n"+start+", "+stop; + } + + private boolean processRnaLine(final GffLine gline, final ScafData sd, final int type){ + final int strand=gline.strand; + assert(strand==sd.strand()); + final byte[] frames=sd.frames; + int start=gline.start-1, stop=gline.stop-1; + if(start<0 || stop>=sd.length()){return false;} + assert(start<=stop); + if(strand==Shared.MINUS){ + int x=sd.length()-start-1; + int y=sd.length()-stop-1; + start=y; + stop=x; + } + + if(AnalyzeGenes.alignRibo){ +// byte[] seq=sd.fetch(start, stop); + Read[] consensusReads=ProkObject.consensusReads(type); + byte[] universal=(consensusReads!=null && consensusReads.length>0 ? consensusReads[0].bases : null); + float minIdentity=ProkObject.minID(type); + if(universal!=null){ + int[] coords=KillSwitch.allocInt1D(2); + final int a=Tools.max(0, start-(AnalyzeGenes.adjustEndpoints ? 200 : 50)); + final int b=Tools.min(sd.bases.length-1, stop+(AnalyzeGenes.adjustEndpoints ? 200 : 50)); + float id1=align(universal, sd.bases, a, b, minIdentity, coords); + final int rstart=coords[0], rstop=coords[1]; +// assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop; + if(id1<minIdentity){ +// System.err.println("Low identity: "+String.format("%.2s", 100*id1)); + return false; + }else{ +// System.err.println("Good identity: "+String.format("%.2s", 100*id1)); + } + if(AnalyzeGenes.adjustEndpoints){ + int startSlop=startSlop(type); + int stopSlop=stopSlop(type); + if(Tools.absdif(start, rstart)>startSlop){ +// System.err.println("rstart:\t"+start+" -> "+rstart); + start=rstart; + } + if(Tools.absdif(stop, rstop)>stopSlop){ +// System.err.println("rstop: \t"+stop+" -> "+rstop); + stop=rstop; + } + } + } + } + + StatsContainer sc=allContainers[type]; + sc.start.processPoint(sd.bases, start, 1); + sc.stop.processPoint(sd.bases, stop, 1); + assert(sc!=statsCDS); + + assert(start>=0) : gline.toString()+"\n"+sd.length()+"\n"+sd.name; + final byte flag=typeToFlag(type); + for(int i=start+kInnerRNA-1; i<=stop; i++){ + frames[i]|=flag; + } + return true; + } + + private float align(byte[] query, byte[] ref, int start, int stop, float minIdentity, int[] coords){ +// final int a=0, b=ref.length-1; + SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); + final int minScore=ssa.minScoreByIdentity(query.length, minIdentity); + int[] max=ssa.fillUnlimited(query, ref, start, stop, minScore); + if(max==null){return 0;} + + final int rows=max[0]; + final int maxCol=max[1]; + final int maxState=max[2]; +// final int maxScore=max[3]; +// final int maxStart=max[4]; + + //returns {score, bestRefStart, bestRefStop} + //padded: {score, bestRefStart, bestRefStop, padLeft, padRight}; + final int[] score=ssa.score(query, ref, start, stop, rows, maxCol, maxState); + final int rstart=Tools.max(score[1], 0); + final int rstop=Tools.min(score[2], ref.length-1); + if(coords!=null){ + coords[0]=rstart; + coords[1]=rstop; + } + final float id=ssa.tracebackIdentity(query, ref, start, stop, rows, maxCol, maxState, null); + return id; + } + + /** + * Each frame byte has a bit marked for valid coding frames. + * For example, if frames[23]=0b100, then base 23 is the last base in a kmer starting at the 3rd base in a codon. + * If frames[23]=0, then no coding kmer end at that location on this strand. + * @param start + * @param stop + * @param frames + * @param k + */ + private static void markFrames(int start, int stop, byte[] frames, int k){ + assert(start<=stop) : start+", "+stop; + for(int i=start+k-1, frameBit=(1<<((k-1)%3)), max=Tools.min(stop-3, frames.length-1); i<=max; i++){ + frames[i]=(byte)(frames[i]|frameBit); + frameBit<<=1; + if(frameBit>4){frameBit=1;} + } +// assert(false) : Arrays.toString(Arrays.copyOfRange(frames, start, start+20))+"\n"+start; //This is correct + } + + /*--------------------------------------------------------------*/ + /*---------------- Counting Kmers ----------------*/ + /*--------------------------------------------------------------*/ + + private void processCDS(ScafData sd, int strand){ + if(!callCDS){return;} + ArrayList<GffLine> glines=sd.cdsLines[strand]; + for(GffLine gline : glines){ + assert(gline.strand==strand); + processGene(gline, sd); + statsCDS.addLength(gline.length()); + } + + statsCDS.inner.processCDSFrames(sd.bases, sd.frames); + BitSet startSet=processEnds(sd.bases, statsCDS.start, sd.starts, 1); + BitSet stopSet=processEnds(sd.bases, statsCDS.stop, sd.stops, 1); +// outstream.println("Processed "+sd.starts.size+" valid starts and "+sd.stops.size+" stops."); + sd.clear(); + findStartCodons(sd.bases, sd.starts, startSet); + findStopCodons(sd.bases, sd.stops, stopSet); +// outstream.println("Found "+sd.starts.size+" invalid starts and "+sd.stops.size+" stops."); + processEnds(sd.bases, statsCDS.start, sd.starts, 0); + processEnds(sd.bases, statsCDS.stop, sd.stops, 0); + } + + private static int getGlineType(GffLine gline, ScafData sd){ + if(!gline.inbounds(sd.length()) || gline.partial()){return -1;} + + final int length=gline.length(); + final int type=gline.prokType(); + if(type<0){ + return type; + }else if(type==CDS){ + return type; + }else if(type==tRNA && length>=40 && length<=120){ + return type; + }else if(type==r16S && length>=1440 && length<=1620){ + return type; + }else if(type==r23S && length>=2720 && length<=3170){ + return type; + }else if(type==r5S && length>=90 && length<=150){ + return type; + }else if(type==r18S && length>=1400 && length<=2000){ //TODO: Check length range + return type; + } + return -1; + } + + private void processRNA(ScafData sd, int strand){ + sd.clear(); + ArrayList<GffLine> lines=sd.rnaLines[strand]; + for(GffLine gline : lines){ + assert(gline.strand==strand); + final int type=getGlineType(gline, sd); + if(type>0){ + StatsContainer sc=allContainers[type]; + sc.addLength(gline.length()); + processRnaLine(gline, sd, type); + } + } + processRnaInner(sd); + processRnaEnds(sd); + } + + void processRnaInner(ScafData sd){ + byte[] bases=sd.bases; + byte[] frames=sd.frames; + final int k=kInnerRNA;//TODO: Note! This is linked to a single static variable for all RNAs. + final int mask=~((-1)<<(2*k)); + int kmer=0; + int len=0; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + if(x>=0){ + len++; + if(len>=k){ + int vf=frames[i]; + for(int type=0; type<5; type++){ + int valid=vf&1; + rnaContainers[type].inner.add(kmer, 0, valid); + vf=(vf>>1); + } + } + }else{len=0;} + } + } + + void processRnaEnds(ScafData sd){ + byte[] bases=sd.bases; + + final int k=stats16S.start.k; + final int kMax=stats16S.start.kMax; + final int mask=stats16S.start.mask; + final long[] counts=new long[kMax];//TODO: Slow + + int kmer=0; + int len=0; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + int x=AminoAcid.baseToNumber[b]; + kmer=((kmer<<2)|x)&mask; + + if(x>=0){ + len++; + if(len>=k){ + counts[kmer]++; + } + }else{len=0;} + } + for(StatsContainer sc : rnaContainers){ + FrameStats fs=sc.start; + for(long[] array : fs.countsFalse){ + Tools.add(array, counts); + } + fs=sc.stop; + for(long[] array : fs.countsFalse){ + Tools.add(array, counts); + } + } + } + + private static BitSet processEnds(byte[] bases, FrameStats stats, IntList list, int valid){ + BitSet points=new BitSet(bases.length); + for(int i=0; i<list.size; i++){ + int point=list.get(i); + stats.processPoint(bases, list.get(i), valid); + points.set(point); + } + return points; + } + + /*--------------------------------------------------------------*/ + /*---------------- Scoring ----------------*/ + /*--------------------------------------------------------------*/ + +// //Assumes bases are in the correct strand +// public float calcStartScore(int start, int stop, byte[] bases){ +// float f=scorePoint(start, bases, startStats); +//// float ss=scoreStart2(start, bases, stop, innerKmerStats); +//// if(ss>0){f=(f+0.0005f*ss);} //Does not seem to help; needs more study. +// return f; +// } +// +// //Assumes bases are in the correct strand +// public float calcStopScore(int stop, byte[] bases){ +// float f=scorePoint(stop, bases, stopStats); +// return f; +// } +// +// //Assumes bases are in the correct strand +// public float calcRnaStartScore(int start, int stop, byte[] bases){ +// float f=scorePoint(start, bases, rrnaStartStats); +// return f; +// } +// +// //Assumes bases are in the correct strand +// public float calcRnaStopScore(int stop, byte[] bases){ +// float f=scorePoint(stop, bases, rrnaStopStats); +// return f; +// } + +// public static float calcKmerScore(int start, int stop, int startFrame, byte[] bases, FrameStats stats){ +// +// assert(stats.frames==3); +// final int k=stats.k; +// final int mask=~((-1)<<(2*k)); +// +// int kmer=0; +// int len=0; +// float score=0; +// int numKmers=0; +// +// for(int pos=start, currentFrame=startFrame; pos<stop; pos++){ +// final byte b=bases[pos]; +// final int x=AminoAcid.baseToNumber[b]; +// +// if(x>=0){ +// kmer=((kmer<<2)|x)&mask; +// len++; +// if(len>=k){ +// float prob=stats.probs[currentFrame][kmer]; +// float dif=prob-0.99f;//Prob above 1 is more likely than average +// score+=dif; +// numKmers++; +// } +// }else{ +// len=0; +// kmer=0; +// } +// +// currentFrame++; +// if(currentFrame>2){currentFrame=0;} +// } +// return score/Tools.max(1f, numKmers); +// } +// +// /** +// * TODO +// * Evaluate the relative difference between left and right frequencies. +// * The purpose is to find locations where the left side looks noncoding and the right side looks coding. +// * Does not currently yield useful results. +// */ +// public static float scoreStart2(int point, byte[] bases, int stop, FrameStats stats){ +// final int k=stats.k; +// +// int start=point-45; +// if(start<0 || stop>bases.length){return 0.5f;} +// +// float left=calcKmerScore(start, Tools.min(point+k-2, bases.length), 0, bases, stats); +// float right=calcKmerScore(point, stop-3, 0, bases, stats); +// return right-left; //High numbers are likely to be starts; non-starts should be near 0. +// } + + /*--------------------------------------------------------------*/ + /*---------------- toString ----------------*/ + /*--------------------------------------------------------------*/ + + @Override + public String toString(){ + return appendTo(new ByteBuilder()).toString(); + } + + public ByteBuilder appendTo(ByteBuilder bb){ + +// Collections.sort(fnames); + taxIds.sort(); + + bb.append("#BBMap "+Shared.BBMAP_VERSION_STRING+" Prokaryotic Gene Model\n"); + bb.append("#files"); + bb.tab().append(numFiles); +// if(fnames.size()>5){ +// bb.tab().append(fnames.size()); +// }else{ +// for(String fname : fnames){ +// bb.tab().append(fname); +// } +// } + bb.nl(); + bb.append("#taxIDs"); + for(int i=0; i<taxIds.size; i++){ + bb.tab().append(taxIds.get(i)); + } + bb.nl(); +// bb.append("#k_inner\t").append(innerKmerLength).nl(); +// bb.append("#k_end\t").append(endKmerLength).nl(); +// bb.append("#start_left_offset\t").append(startLeftOffset).nl(); +// bb.append("#start_right_offset\t").append(startRightOffset).nl(); +// bb.append("#stop_left_offset\t").append(stopLeftOffset).nl(); +// bb.append("#stop_right_offset\t").append(stopRightOffset).nl(); + bb.append("#scaffolds\t").append(readsProcessed).nl(); + bb.append("#bases\t").append(basesProcessed).nl(); + bb.append("#genes\t").append(genesProcessed).nl(); + bb.append("#GC\t").append(gc(),2).nl(); + bb.append("#ACGTN"); + for(long x : baseCounts){ + bb.tab().append(x); + } + bb.nl(); + + for(StatsContainer sc : allContainers){ + sc.appendTo(bb); + } + assert(allContainers.length>5) : allContainers.length; + + return bb; + } + + /*--------------------------------------------------------------*/ + /*---------------- Stats ----------------*/ + /*--------------------------------------------------------------*/ + + public final StatsContainer statsCDS=new StatsContainer(CDS); + public final StatsContainer statstRNA=new StatsContainer(tRNA); + public final StatsContainer stats16S=new StatsContainer(r16S); + public final StatsContainer stats23S=new StatsContainer(r23S); + public final StatsContainer stats5S=new StatsContainer(r5S); + public final StatsContainer stats18S=new StatsContainer(r18S); + + final StatsContainer[] rnaContainers=new StatsContainer[] {statstRNA, stats16S, stats23S, stats5S, stats18S}; + final StatsContainer[] allContainers=new StatsContainer[] {statsCDS, statstRNA, stats16S, stats23S, stats5S, stats18S}; + //public static int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; + +// public final FrameStats innerKmerStats=new FrameStats("innerKmerStats", innerKmerLength, 3, 0); +// public final FrameStats startStats=new FrameStats("startStats", endKmerLength, startFrames, startLeftOffset); +// public final FrameStats stopStats=new FrameStats("stopStats", endKmerLength, stopFrames, stopLeftOffset); +// +// public final FrameStats rrnaStartStats=new FrameStats("rrnaStart", 2, 16, 8); +// public final FrameStats rrnaStopStats=new FrameStats("rrnaStop", 2, 16, 8); +// +// public final FrameStats trnaStats=new FrameStats("tRNA", rnaKmerLength, 1, 0); +// public final FrameStats rrna16Sstats=new FrameStats("16S", rnaKmerLength, 1, 0); +// public final FrameStats rrna23Sstats=new FrameStats("23S", rnaKmerLength, 1, 0); +// public final FrameStats rrna5Sstats=new FrameStats("5S", rnaKmerLength, 1, 0); +// public final FrameStats[] rnaKmerStats=new FrameStats[] {trnaStats, rrna16Sstats, rrna23Sstats, rrna5Sstats}; + + /*--------------------------------------------------------------*/ + +// public ArrayList<String> fnames=new ArrayList<String>(); + public int numFiles=0; + public IntList taxIds=new IntList(); + + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private long maxReads=-1; + + long readsProcessed=0; + long basesProcessed=0; + long genesProcessed=0; + long filesProcessed=0; + long[] baseCounts=new long[5]; + + /*--------------------------------------------------------------*/ + /*---------------- Static Setters ----------------*/ + /*--------------------------------------------------------------*/ + + public synchronized void setStatics(){ +// assert(!setStatics); + kInnerCDS=statsCDS.inner.k; + kStartCDS=statsCDS.start.k; + kStopCDS=statsCDS.stop.k; + + setStartLeftOffset(statsCDS.start.leftOffset); + setStartRightOffset(statsCDS.start.rightOffset()); + + setStopLeftOffset(statsCDS.stop.leftOffset); + setStopRightOffset(statsCDS.stop.rightOffset()); + + kInnerRNA=stats16S.inner.k;//TODO: Why is 16S used here? + kStartRNA=stats16S.start.k; + kStopRNA=stats16S.stop.k; + setStatics=true; + } + + public static void setInnerK(int k){ + kInnerCDS=k; + } + + public static void setStartK(int k){ + kStartCDS=k; + } + + public static void setStopK(int k){ + kStopCDS=k; + } + + public static void setStartLeftOffset(int x){ + startLeftOffset=x; + startFrames=startLeftOffset+startRightOffset+1; +// System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames); + } + + public static void setStartRightOffset(int x){ + startRightOffset=x; + startFrames=startLeftOffset+startRightOffset+1; +// System.err.println("startLeftOffset="+startLeftOffset+", startRightOffset="+startRightOffset+", frames="+startFrames); +// assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames; + } + + public static void setStopLeftOffset(int x){ + stopLeftOffset=x; + stopFrames=stopLeftOffset+stopRightOffset+1; +// System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames); + } + + public static void setStopRightOffset(int x){ + stopRightOffset=x; + stopFrames=stopLeftOffset+stopRightOffset+1; +// System.err.println("stopLeftOffset="+stopLeftOffset+", stopRightOffset="+stopRightOffset+", frames="+stopFrames); +// assert(false) : endLeftOffset+", "+endRightOffset+", "+endFrames; + } + + public static final boolean isStartCodon(int code){ + return code>=0 && code<=63 && isStartCodon[code]; + } + public static final boolean isStopCodon(int code){ + return code>=0 && code<=63 && isStopCodon[code]; + } + + /*--------------------------------------------------------------*/ + /*---------------- Class Init ----------------*/ + /*--------------------------------------------------------------*/ + + private static boolean[] makeIsCodon(String[] codons){ + boolean[] array=new boolean[64]; + for(String s : codons){ + int x=AminoAcid.toNumber(s); + array[x]=true; + } + return array; + } + + public static int kInnerCDS=6; + public static int kStartCDS=3; + public static int kStopCDS=3; + + static int startLeftOffset(){return startLeftOffset;} + static int startRightOffset(){return startRightOffset;} + static int startFrames(){return startFrames;} + + private static int startLeftOffset=21; //21 works well for k=4 + private static int startRightOffset=8; //10 works well for k=4 + private static int startFrames=startLeftOffset+startRightOffset+1; + + private static int stopLeftOffset=9; + private static int stopRightOffset=12; + private static int stopFrames=stopLeftOffset+stopRightOffset+1; + + private static boolean setStatics=false; + + /*--------------------------------------------------------------*/ + /*---------------- More Statics ----------------*/ + /*--------------------------------------------------------------*/ + + //E. coli uses 83% AUG (3542/4284), 14% (612) GUG, 3% (103) UUG[7] and one or two others (e.g., an AUU and possibly a CUG).[8][9] + public static String[] startCodons=new String[] {"ATG", "GTG", "TTG"}; + public static String[] extendedStartCodons=new String[] {"ATG", "GTG", "TTG", "ATT", "CTG", "ATA"}; + public static String[] stopCodons=new String[] {"TAG", "TAA", "TGA"}; + public static boolean[] isStartCodon=makeIsCodon(startCodons); + public static boolean[] isStopCodon=makeIsCodon(stopCodons); + + /*--------------------------------------------------------------*/ + + private static PrintStream outstream=System.err; + public static boolean verbose=false; + public static boolean errorState=false; + +}