jpayne@68: package prok; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.io.PrintStream; jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.Arrays; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import dna.Data; jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.ByteStreamWriter; jpayne@68: import fileIO.FileFormat; jpayne@68: import fileIO.ReadWrite; jpayne@68: import gff.CompareGff; jpayne@68: import jgi.BBMerge; jpayne@68: import json.JsonObject; jpayne@68: import shared.Parse; jpayne@68: import shared.Parser; jpayne@68: import shared.PreParser; jpayne@68: import shared.ReadStats; jpayne@68: import shared.Shared; jpayne@68: import shared.Timer; jpayne@68: import shared.Tools; jpayne@68: import stream.ConcurrentReadInputStream; jpayne@68: import stream.ConcurrentReadOutputStream; jpayne@68: import stream.Read; jpayne@68: import structures.ByteBuilder; jpayne@68: import structures.ListNum; jpayne@68: jpayne@68: /** jpayne@68: * This is the executable class for gene-calling. jpayne@68: * @author Brian Bushnell jpayne@68: * @date Sep 24, 2018 jpayne@68: * jpayne@68: */ jpayne@68: public class CallGenes extends ProkObject { jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Initialization ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Code entrance from the command line. jpayne@68: * @param args Command line arguments jpayne@68: */ jpayne@68: public static void main(String[] args){ jpayne@68: //Start a timer immediately upon code entrance. jpayne@68: Timer t=new Timer(); jpayne@68: jpayne@68: //Create an instance of this class jpayne@68: CallGenes x=new CallGenes(args); jpayne@68: jpayne@68: //Run the object jpayne@68: x.process(t); jpayne@68: jpayne@68: //Close the print stream if it was redirected jpayne@68: Shared.closeStream(x.outstream); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Constructor. jpayne@68: * @param args Command line arguments jpayne@68: */ jpayne@68: public CallGenes(String[] args){ jpayne@68: jpayne@68: {//Preparse block for help, config files, and outstream jpayne@68: PreParser pp=new PreParser(args, (args.length>40 ? null : getClass()), false); jpayne@68: args=pp.args; jpayne@68: outstream=pp.outstream; jpayne@68: } jpayne@68: jpayne@68: //Set shared static variables prior to parsing jpayne@68: ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; jpayne@68: ReadWrite.MAX_ZIP_THREADS=Shared.threads(); jpayne@68: jpayne@68: {//Parse the arguments jpayne@68: final Parser parser=parse(args); jpayne@68: overwrite=parser.overwrite; jpayne@68: append=parser.append; jpayne@68: jpayne@68: outGff=parser.out1; jpayne@68: maxReads=parser.maxReads; jpayne@68: } jpayne@68: jpayne@68: fixExtensions(); //Add or remove .gz or .bz2 as needed jpayne@68: checkFileExistence(); //Ensure files can be read and written jpayne@68: checkStatics(); //Adjust file-related static fields as needed for this program jpayne@68: jpayne@68: ffoutGff=FileFormat.testOutput(outGff, FileFormat.GFF, null, true, overwrite, append, ordered); jpayne@68: ffoutAmino=FileFormat.testOutput(outAmino, FileFormat.FA, null, true, overwrite, append, ordered); jpayne@68: ffout16S=FileFormat.testOutput(out16S, FileFormat.FA, null, true, overwrite, append, ordered); jpayne@68: ffout18S=FileFormat.testOutput(out18S, FileFormat.FA, null, true, overwrite, append, ordered); jpayne@68: jpayne@68: if(ffoutGff!=null){ jpayne@68: assert(!ffoutGff.isSequence()) : "\nout is for gff files. To output sequence, please use outa."; jpayne@68: } jpayne@68: if(ffoutAmino!=null){ jpayne@68: assert(!ffoutAmino.gff()) : "\nouta is for sequence data. To output gff, please use out."; jpayne@68: } jpayne@68: if(ffout16S!=null){ jpayne@68: assert(!ffout16S.gff()) : "\nout16S is for sequence data. To output gff, please use out."; jpayne@68: } jpayne@68: if(ffout18S!=null){ jpayne@68: assert(!ffout18S.gff()) : "\nout18S is for sequence data. To output gff, please use out."; jpayne@68: } jpayne@68: jpayne@68: if(geneHistFile==null){geneHistBins=0;} jpayne@68: else{ jpayne@68: assert(geneHistBins>1) : "geneHistBins="+geneHistBins+"; should be >1"; jpayne@68: assert(geneHistDiv>=1) : "geneHistDiv="+geneHistDiv+"; should be >=1"; jpayne@68: } jpayne@68: geneHist=geneHistBins>1 ? new long[geneHistBins] : null; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Initialization Helpers ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Parse arguments from the command line */ jpayne@68: private Parser parse(String[] args){ jpayne@68: jpayne@68: Parser parser=new Parser(); jpayne@68: for(int i=0; i1 ? split[1] : null; jpayne@68: if(b!=null && b.equalsIgnoreCase("null")){b=null;} jpayne@68: jpayne@68: // outstream.println(arg+", "+a+", "+b); jpayne@68: if(PGMTools.parseStatic(arg, a, b)){ jpayne@68: //do nothing jpayne@68: }else if(a.equals("in") || a.equals("infna") || a.equals("fnain") || a.equals("fna")){ jpayne@68: assert(b!=null); jpayne@68: Tools.addFiles(b, fnaList); jpayne@68: }else if(b==null && new File(arg).exists() && FileFormat.isFastaFile(arg)){ jpayne@68: fnaList.add(arg); jpayne@68: }else if(a.equals("pgm") || a.equals("gm") || a.equals("model")){ jpayne@68: assert(b!=null); jpayne@68: if(b.equalsIgnoreCase("auto") || b.equalsIgnoreCase("default")){ jpayne@68: b=Data.findPath("?model.pgm"); jpayne@68: pgmList.add(b); jpayne@68: }else{ jpayne@68: Tools.addFiles(b, pgmList); jpayne@68: } jpayne@68: }else if(b==null && new File(arg).exists() && FileFormat.isPgmFile(arg)){ jpayne@68: Tools.addFiles(arg, pgmList); jpayne@68: }else if(a.equals("outamino") || a.equals("aminoout") || a.equals("outa") || a.equals("outaa") || a.equals("aaout") || a.equals("amino")){ jpayne@68: outAmino=b; jpayne@68: }else if(a.equalsIgnoreCase("out16s") || a.equalsIgnoreCase("16sout")){ jpayne@68: out16S=b; jpayne@68: }else if(a.equalsIgnoreCase("out18s") || a.equalsIgnoreCase("18sout")){ jpayne@68: out18S=b; jpayne@68: }else if(a.equals("verbose")){ jpayne@68: verbose=Parse.parseBoolean(b); jpayne@68: //ReadWrite.verbose=verbose; jpayne@68: GeneCaller.verbose=verbose; jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("json_out") || a.equalsIgnoreCase("json")){ jpayne@68: json_out=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("stats") || a.equalsIgnoreCase("outstats")){ jpayne@68: outStats=b; jpayne@68: }else if(a.equals("hist") || a.equalsIgnoreCase("outhist") || a.equalsIgnoreCase("lengthhist") || a.equalsIgnoreCase("lhist") || a.equalsIgnoreCase("genehist")){ jpayne@68: geneHistFile=b; jpayne@68: }else if(a.equals("bins")){ jpayne@68: geneHistBins=Integer.parseInt(b); jpayne@68: }else if(a.equals("binlen") || a.equals("binlength") || a.equals("histdiv")){ jpayne@68: geneHistBins=Integer.parseInt(b); jpayne@68: }else if(a.equals("printzero") || a.equals("pz")){ jpayne@68: printZeroCountHist=Parse.parseBoolean(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("merge")){ jpayne@68: merge=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("ecco")){ jpayne@68: ecco=Parse.parseBoolean(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("setbias16s")) { jpayne@68: GeneCaller.biases[r16S]=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("setbias18s")) { jpayne@68: GeneCaller.biases[r18S]=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("setbias23s")) { jpayne@68: GeneCaller.biases[r23S]=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("setbias5s")) { jpayne@68: GeneCaller.biases[r5S]=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("setbiastRNA")) { jpayne@68: GeneCaller.biases[tRNA]=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("setbiasCDS")) { jpayne@68: GeneCaller.biases[CDS]=Float.parseFloat(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("ordered")){ jpayne@68: ordered=Parse.parseBoolean(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("translate")){ jpayne@68: mode=TRANSLATE; jpayne@68: }else if(a.equals("retranslate") || a.equals("detranslate")){ jpayne@68: mode=RETRANSLATE; jpayne@68: }else if(a.equals("recode")){ jpayne@68: mode=RECODE; jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("minlen") || a.equals("minlength")){ jpayne@68: minLen=Integer.parseInt(b); jpayne@68: }else if(a.equals("maxoverlapss") || a.equals("overlapss") || a.equals("overlapsamestrand") || a.equals("moss") || a.equalsIgnoreCase("maxOverlapSameStrand")){ jpayne@68: maxOverlapSameStrand=Integer.parseInt(b); jpayne@68: }else if(a.equals("maxoverlapos") || a.equals("overlapos") || a.equals("overlapoppositestrand") || a.equals("moos") || a.equalsIgnoreCase("maxOverlapOppositeStrand")){ jpayne@68: maxOverlapOppositeStrand=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("minStartScore")){ jpayne@68: minStartScore=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("minStopScore")){ jpayne@68: minStopScore=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("minInnerScore") || a.equalsIgnoreCase("minKmerScore")){ jpayne@68: minKmerScore=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("minOrfScore") || a.equalsIgnoreCase("minScore")){ jpayne@68: minOrfScore=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("minAvgScore")){ jpayne@68: minAvgScore=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("breakLimit")){ jpayne@68: GeneCaller.breakLimit=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("clearcutoffs") || a.equalsIgnoreCase("clearfilters")){ jpayne@68: GeneCaller.breakLimit=9999; jpayne@68: minOrfScore=-9999; jpayne@68: minAvgScore=-9999; jpayne@68: minKmerScore=-9999; jpayne@68: minStopScore=-9999; jpayne@68: minStartScore=-9999; jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("e1")){ jpayne@68: Orf.e1=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("e2")){ jpayne@68: Orf.e2=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("e3")){ jpayne@68: Orf.e3=Float.parseFloat(b); jpayne@68: } jpayne@68: else if(a.equalsIgnoreCase("f1")){ jpayne@68: Orf.f1=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("f2")){ jpayne@68: Orf.f2=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("f3")){ jpayne@68: Orf.f3=Float.parseFloat(b); jpayne@68: } jpayne@68: else if(a.equalsIgnoreCase("p0")){ jpayne@68: GeneCaller.p0=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("p1")){ jpayne@68: GeneCaller.p1=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("p2")){ jpayne@68: GeneCaller.p2=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("p3")){ jpayne@68: GeneCaller.p3=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("p4")){ jpayne@68: GeneCaller.p4=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("p5")){ jpayne@68: GeneCaller.p5=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("p6")){ jpayne@68: GeneCaller.p6=Float.parseFloat(b); jpayne@68: } jpayne@68: else if(a.equalsIgnoreCase("q1")){ jpayne@68: GeneCaller.q1=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("q2")){ jpayne@68: GeneCaller.q2=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("q3")){ jpayne@68: GeneCaller.q3=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("q4")){ jpayne@68: GeneCaller.q4=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("q5")){ jpayne@68: GeneCaller.q5=Float.parseFloat(b); jpayne@68: } jpayne@68: else if(a.equalsIgnoreCase("lookback")){ jpayne@68: GeneCaller.lookbackPlus=GeneCaller.lookbackMinus=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("lookbackplus")){ jpayne@68: GeneCaller.lookbackPlus=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("lookbackminus")){ jpayne@68: GeneCaller.lookbackMinus=Integer.parseInt(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("compareto")){ jpayne@68: compareToGff=b; jpayne@68: } jpayne@68: jpayne@68: else if(ProkObject.parse(arg, a, b)){} jpayne@68: jpayne@68: else if(parser.parse(arg, a, b)){ jpayne@68: //do nothing jpayne@68: }else{ jpayne@68: outstream.println("Unknown parameter "+args[i]); jpayne@68: assert(false) : "Unknown parameter "+args[i]; jpayne@68: // throw new RuntimeException("Unknown parameter "+args[i]); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(pgmList.isEmpty()){ jpayne@68: String b=Data.findPath("?model.pgm"); jpayne@68: pgmList.add(b); jpayne@68: } jpayne@68: for(int i=0; i foo=new ArrayList(); jpayne@68: foo.addAll(fnaList); jpayne@68: foo.addAll(pgmList); jpayne@68: if(!Tools.testInputFiles(false, true, foo.toArray(new String[0]))){ jpayne@68: throw new RuntimeException("\nCan't read some input files.\n"); jpayne@68: } jpayne@68: jpayne@68: //Ensure that no file was specified multiple times jpayne@68: foo.add(outGff); jpayne@68: foo.add(outAmino); jpayne@68: foo.add(out16S); jpayne@68: foo.add(out18S); jpayne@68: foo.add(outStats); jpayne@68: foo.add(geneHistFile); jpayne@68: if(!Tools.testForDuplicateFiles(true, foo.toArray(new String[0]))){ jpayne@68: throw new RuntimeException("\nSome file names were specified multiple times.\n"); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /** Adjust file-related static fields as needed for this program */ jpayne@68: private static void checkStatics(){ jpayne@68: //Adjust the number of threads for input file reading jpayne@68: if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ jpayne@68: ByteFile.FORCE_MODE_BF2=true; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Outer Methods ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Create read streams and process all data */ jpayne@68: void process(Timer t){ jpayne@68: jpayne@68: GeneModel pgm=PGMTools.loadAndMerge(pgmList); jpayne@68: jpayne@68: if(call16S || call18S || call23S || calltRNA || call5S){ jpayne@68: loadLongKmers(); jpayne@68: loadConsensusSequenceFromFile(false, false); jpayne@68: } jpayne@68: jpayne@68: ByteStreamWriter bsw=makeBSW(ffoutGff); jpayne@68: if(bsw!=null){ jpayne@68: bsw.forcePrint("##gff-version 3\n"); jpayne@68: } jpayne@68: jpayne@68: ConcurrentReadOutputStream rosAmino=makeCros(ffoutAmino); jpayne@68: ConcurrentReadOutputStream ros16S=makeCros(ffout16S); jpayne@68: ConcurrentReadOutputStream ros18S=makeCros(ffout18S); jpayne@68: jpayne@68: //Turn off read validation in the input threads to increase speed jpayne@68: final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; jpayne@68: Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; jpayne@68: jpayne@68: //Reset counters jpayne@68: readsIn=genesOut=0; jpayne@68: basesIn=bytesOut=0; jpayne@68: jpayne@68: for(String fname : fnaList){ jpayne@68: //Create a read input stream jpayne@68: final ConcurrentReadInputStream cris=makeCris(fname); jpayne@68: jpayne@68: //Process the reads in separate threads jpayne@68: spawnThreads(cris, bsw, rosAmino, ros16S, ros18S, pgm); jpayne@68: jpayne@68: //Close the input stream jpayne@68: errorState|=ReadWrite.closeStream(cris); jpayne@68: } jpayne@68: jpayne@68: //Close the input stream jpayne@68: errorState|=ReadWrite.closeStreams(null, rosAmino, ros16S, ros18S); jpayne@68: jpayne@68: if(verbose){outstream.println("Finished; closing streams.");} jpayne@68: jpayne@68: //Write anything that was accumulated by ReadStats jpayne@68: errorState|=ReadStats.writeAll(); jpayne@68: //Close the output stream jpayne@68: if(bsw!=null){errorState|=bsw.poisonAndWait();} jpayne@68: jpayne@68: //Reset read validation jpayne@68: Read.VALIDATE_IN_CONSTRUCTOR=vic; jpayne@68: jpayne@68: //Report timing and results jpayne@68: t.stop(); jpayne@68: outstream.println(Tools.timeReadsBasesProcessed(t, readsIn, basesIn, 8)); jpayne@68: outstream.println(Tools.linesBytesOut(readsIn, basesIn, genesOut, bytesOut, 8, false)); jpayne@68: outstream.println(); jpayne@68: jpayne@68: if(json_out){ jpayne@68: printStatsJson(outStats); jpayne@68: }else{ jpayne@68: printStats(outStats); jpayne@68: } jpayne@68: jpayne@68: if(geneHistFile!=null){ jpayne@68: printHist(geneHistFile); jpayne@68: } jpayne@68: jpayne@68: //Throw an exception of there was an error in a thread jpayne@68: if(errorState){ jpayne@68: throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); jpayne@68: } jpayne@68: jpayne@68: if(compareToGff!=null){ jpayne@68: if(compareToGff.equals("auto")){ jpayne@68: compareToGff=fnaList.get(0).replace(".fna", ".gff"); jpayne@68: compareToGff=compareToGff.replace(".fa", ".gff"); jpayne@68: compareToGff=compareToGff.replace(".fasta", ".gff"); jpayne@68: } jpayne@68: CompareGff.main(new String[] {outGff, compareToGff}); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: private void printStats(String fname){ jpayne@68: if(fname==null){return;} jpayne@68: ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, false); jpayne@68: bsw.start(); jpayne@68: jpayne@68: if(callCDS){ jpayne@68: bsw.println("Gene Starts Made: \t "+Tools.padLeft(geneStartsMade, 12)); jpayne@68: bsw.println("Gene Stops Made: \t "+Tools.padLeft(geneStopsMade, 12)); jpayne@68: bsw.println("Gene Starts Retained: \t "+Tools.padLeft(geneStartsRetained, 12)); jpayne@68: bsw.println("Gene Stops Retained: \t "+Tools.padLeft(geneStopsRetained, 12)); jpayne@68: bsw.println("CDS Out: \t "+Tools.padLeft(geneStartsOut, 12)); jpayne@68: } jpayne@68: if(call16S){bsw.println("16S Out: \t "+Tools.padLeft(r16SOut, 12));} jpayne@68: if(call18S){bsw.println("18S Out: \t "+Tools.padLeft(r18SOut, 12));} jpayne@68: if(call23S){bsw.println("23S Out: \t "+Tools.padLeft(r23SOut, 12));} jpayne@68: if(call5S){bsw.println("5S Out: \t "+Tools.padLeft(r5SOut, 12));} jpayne@68: if(calltRNA){bsw.println("tRNA Out: \t "+Tools.padLeft(tRNAOut, 12));} jpayne@68: jpayne@68: if(callCDS){ jpayne@68: bsw.println(); jpayne@68: bsw.println("All ORF Stats:"); jpayne@68: bsw.print(stCds.toString()); jpayne@68: jpayne@68: bsw.println(); jpayne@68: bsw.println("Retained ORF Stats:"); jpayne@68: bsw.print(stCds2.toString()); jpayne@68: jpayne@68: bsw.println(); jpayne@68: bsw.println("Called ORF Stats:"); jpayne@68: stCdsPass.genomeSize=basesIn; jpayne@68: bsw.print(stCdsPass.toString()); jpayne@68: } jpayne@68: jpayne@68: if(call16S){ jpayne@68: bsw.println(); jpayne@68: bsw.println("Called 16S Stats:"); jpayne@68: bsw.print(st16s.toString()); jpayne@68: } jpayne@68: if(call23S){ jpayne@68: bsw.println(); jpayne@68: bsw.println("Called 23S Stats:"); jpayne@68: bsw.print(st23s.toString()); jpayne@68: } jpayne@68: if(call5S){ jpayne@68: bsw.println(); jpayne@68: bsw.println("Called 5S Stats:"); jpayne@68: bsw.print(st5s.toString()); jpayne@68: } jpayne@68: if(call18S){ jpayne@68: bsw.println(); jpayne@68: bsw.println("Called 18S Stats:"); jpayne@68: bsw.print(st18s.toString()); jpayne@68: } jpayne@68: bsw.poisonAndWait(); jpayne@68: } jpayne@68: jpayne@68: private void printStatsJson(String fname){ jpayne@68: if(fname==null){return;} jpayne@68: jpayne@68: JsonObject outer=new JsonObject(); jpayne@68: jpayne@68: { jpayne@68: JsonObject jo=new JsonObject(); jpayne@68: if(callCDS){ jpayne@68: jo.add("Gene Starts Made", geneStartsMade); jpayne@68: jo.add("Gene Stops Made", geneStopsMade); jpayne@68: jo.add("Gene Starts Retained", geneStartsRetained); jpayne@68: jo.add("Gene Stops Retained", geneStopsRetained); jpayne@68: jo.add("CDS Out", geneStartsOut); jpayne@68: } jpayne@68: if(call16S){jo.add("16S Out", r16SOut);} jpayne@68: if(call18S){jo.add("18S Out", r18SOut);} jpayne@68: if(call23S){jo.add("23S Out", r23SOut);} jpayne@68: if(call5S){jo.add("5S Out", r5SOut);} jpayne@68: if(calltRNA){jo.add("tRNA Out", tRNAOut);} jpayne@68: outer.add("Overall", jo); jpayne@68: } jpayne@68: jpayne@68: if(callCDS){ jpayne@68: outer.add("All ORF Stats", stCds.toJson()); jpayne@68: outer.add("Retained ORF Stats", stCds2.toJson()); jpayne@68: stCdsPass.genomeSize=basesIn; jpayne@68: outer.add("Called ORF Stats", stCdsPass.toJson()); jpayne@68: } jpayne@68: jpayne@68: if(call16S){outer.add("Called 16S Stats", st16s.toJson());} jpayne@68: if(call18S){outer.add("Called 18S Stats", st18s.toJson());} jpayne@68: if(call23S){outer.add("Called 23S Stats", st23s.toJson());} jpayne@68: if(call5S){outer.add("Called 5S Stats", st5s.toJson());} jpayne@68: if(calltRNA){outer.add("Called tRNA Stats", sttRNA.toJson());} jpayne@68: jpayne@68: jpayne@68: ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, false); jpayne@68: bsw.start(); jpayne@68: bsw.println(outer.toText()); jpayne@68: bsw.poisonAndWait(); jpayne@68: } jpayne@68: jpayne@68: private void printHist(String fname){ jpayne@68: if(fname==null || geneHist==null){return;} jpayne@68: ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, false); jpayne@68: bsw.start(); jpayne@68: long sum=Tools.sum(geneHist); jpayne@68: double mean=Tools.averageHistogram(geneHist)*geneHistDiv; jpayne@68: int median=Tools.medianHistogram(geneHist)*geneHistDiv; jpayne@68: double std=Tools.standardDeviationHistogram(geneHist)*geneHistDiv; jpayne@68: bsw.println("#Gene Length Histogram"); jpayne@68: bsw.print("#Genes:\t").println(sum); jpayne@68: bsw.print("#Mean:\t").println(mean, 4); jpayne@68: bsw.print("#Median:\t").println(median); jpayne@68: bsw.print("#STDDev:\t").println(std, 4); jpayne@68: bsw.print("#Length\tCount\n"); jpayne@68: long cum=0; jpayne@68: for(int i=0; i0 || printZeroCountHist){ jpayne@68: bsw.print(len).tab().println(count); jpayne@68: } jpayne@68: } jpayne@68: bsw.poisonAndWait(); jpayne@68: } jpayne@68: jpayne@68: private ConcurrentReadInputStream makeCris(String fname){ jpayne@68: FileFormat ffin=FileFormat.testInput(fname, FileFormat.FA, null, true, true); jpayne@68: ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ffin, null); jpayne@68: cris.start(); //Start the stream jpayne@68: if(verbose){outstream.println("Started cris");} jpayne@68: return cris; jpayne@68: } jpayne@68: jpayne@68: /** Spawn process threads */ jpayne@68: private void spawnThreads(final ConcurrentReadInputStream cris, final ByteStreamWriter bsw, jpayne@68: ConcurrentReadOutputStream rosAmino, ConcurrentReadOutputStream ros16S, ConcurrentReadOutputStream ros18S, GeneModel pgm){ jpayne@68: jpayne@68: //Do anything necessary prior to processing jpayne@68: jpayne@68: //Determine how many threads may be used jpayne@68: final int threads=Shared.threads(); jpayne@68: jpayne@68: //Fill a list with ProcessThreads jpayne@68: ArrayList alpt=new ArrayList(threads); jpayne@68: for(int i=0; i alpt){ jpayne@68: jpayne@68: //Wait for completion of all threads jpayne@68: boolean success=true; jpayne@68: for(ProcessThread pt : alpt){ jpayne@68: jpayne@68: //Wait until this thread has terminated jpayne@68: while(pt.getState()!=Thread.State.TERMINATED){ jpayne@68: try { jpayne@68: //Attempt a join operation jpayne@68: pt.join(); jpayne@68: } catch (InterruptedException e) { jpayne@68: //Potentially handle this, if it is expected to occur jpayne@68: e.printStackTrace(); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: //Accumulate per-thread statistics jpayne@68: readsIn+=pt.readsInT; jpayne@68: basesIn+=pt.basesInT; jpayne@68: genesOut+=pt.genesOutT; jpayne@68: bytesOut+=pt.bytesOutT; jpayne@68: jpayne@68: geneStopsMade+=pt.caller.geneStopsMade; jpayne@68: geneStartsMade+=pt.caller.geneStartsMade; jpayne@68: geneStartsRetained+=pt.caller.geneStartsRetained; jpayne@68: geneStopsRetained+=pt.caller.geneStopsRetained; jpayne@68: geneStartsOut+=pt.caller.geneStartsOut; jpayne@68: jpayne@68: r16SOut+=pt.caller.r16SOut; jpayne@68: r18SOut+=pt.caller.r18SOut; jpayne@68: r23SOut+=pt.caller.r23SOut; jpayne@68: r5SOut+=pt.caller.r5SOut; jpayne@68: tRNAOut+=pt.caller.tRNAOut; jpayne@68: jpayne@68: stCds.add(pt.caller.stCds); jpayne@68: stCds2.add(pt.caller.stCds2); jpayne@68: // stCdsPass.add(pt.caller.stCdsPass); jpayne@68: jpayne@68: for(int i=0; i1 ? new long[geneHistBins] : null); jpayne@68: caller=new GeneCaller(minLen, maxOverlapSameStrand, maxOverlapOppositeStrand, jpayne@68: minStartScore, minStopScore, minKmerScore, minOrfScore, minAvgScore, pgm); jpayne@68: } jpayne@68: jpayne@68: //Called by start() jpayne@68: @Override jpayne@68: public void run(){ jpayne@68: //Do anything necessary prior to processing jpayne@68: jpayne@68: //Process the reads jpayne@68: processInner(); jpayne@68: jpayne@68: //Do anything necessary after processing jpayne@68: jpayne@68: //Indicate successful exit status jpayne@68: success=true; jpayne@68: } jpayne@68: jpayne@68: /** Iterate through the reads */ jpayne@68: void processInner(){ jpayne@68: jpayne@68: //Grab the first ListNum of reads jpayne@68: ListNum ln=cris.nextList(); jpayne@68: jpayne@68: //Check to ensure pairing is as expected jpayne@68: if(ln!=null && !ln.isEmpty()){ jpayne@68: Read r=ln.get(0); jpayne@68: // assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access jpayne@68: } jpayne@68: jpayne@68: //As long as there is a nonempty read list... jpayne@68: while(ln!=null && ln.size()>0){ jpayne@68: // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access jpayne@68: jpayne@68: processList(ln); jpayne@68: jpayne@68: //Fetch a new list jpayne@68: ln=cris.nextList(); jpayne@68: } jpayne@68: jpayne@68: //Notify the input stream that the final list was used jpayne@68: if(ln!=null){ jpayne@68: cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: void processList(ListNum ln){ jpayne@68: //Grab the actual read list from the ListNum jpayne@68: final ArrayList reads=ln.list; jpayne@68: jpayne@68: // System.err.println(reads.size()); jpayne@68: jpayne@68: ArrayList orfList=new ArrayList(); jpayne@68: jpayne@68: //Loop through each read in the list jpayne@68: for(int idx=0; idx0){ jpayne@68: r2.reverseComplement(); jpayne@68: r1=r1.joinRead(insert); jpayne@68: r2=null; jpayne@68: } jpayne@68: }else if(ecco){ jpayne@68: BBMerge.findOverlapStrict(r1, r2, true); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: { jpayne@68: //Reads are processed in this block. jpayne@68: { jpayne@68: ArrayList list=processRead(r1); jpayne@68: if(list!=null){orfList.addAll(list);} jpayne@68: } jpayne@68: if(r2!=null){ jpayne@68: ArrayList list=processRead(r2); jpayne@68: if(list!=null){orfList.addAll(list);} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: genesOutT+=orfList.size(); jpayne@68: ByteBuilder bb=new ByteBuilder(); jpayne@68: jpayne@68: if(bsw!=null){ jpayne@68: if(bsw.ordered){ jpayne@68: for(Orf orf : orfList){ jpayne@68: orf.appendGff(bb); jpayne@68: bb.nl(); jpayne@68: } jpayne@68: bsw.add(bb, ln.id); jpayne@68: bytesOutT+=bb.length(); jpayne@68: }else{ jpayne@68: for(Orf orf : orfList){ jpayne@68: orf.appendGff(bb); jpayne@68: bb.nl(); jpayne@68: // if(bb.length()>=32000){ jpayne@68: // bytesOutT+=bb.length(); jpayne@68: // bsw.addJob(bb); jpayne@68: // bb=new ByteBuilder(); jpayne@68: // } jpayne@68: } jpayne@68: if(bb.length()>0){ jpayne@68: bsw.addJob(bb); jpayne@68: bytesOutT+=bb.length(); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: //Notify the input stream that the list was used jpayne@68: cris.returnList(ln); jpayne@68: // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Process a read or a read pair. jpayne@68: * @param r1 Read 1 jpayne@68: * @param r2 Read 2 (may be null) jpayne@68: * @return True if the reads should be kept, false if they should be discarded. jpayne@68: */ jpayne@68: ArrayList processRead(final Read r){ jpayne@68: ArrayList list=caller.callGenes(r, pgm); jpayne@68: jpayne@68: if(geneHistT!=null && list!=null){ jpayne@68: for(Orf o : list){ jpayne@68: int bin=Tools.min(geneHistT.length-1, o.length()/geneHistDiv); jpayne@68: geneHistT[bin]++; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(ros16S!=null){ jpayne@68: if(list!=null && !list.isEmpty()){ jpayne@68: // System.err.println("Looking for 16S."); jpayne@68: ArrayList ssu=fetchType(r, list, r16S); jpayne@68: if(ssu!=null && !ssu.isEmpty()){ jpayne@68: // System.err.println("Found "+ssu.size()+" 16S."); jpayne@68: ros16S.add(ssu, r.numericID); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(ros18S!=null){ jpayne@68: if(list!=null && !list.isEmpty()){ jpayne@68: ArrayList ssu=fetchType(r, list, r18S); jpayne@68: if(ssu!=null && !ssu.isEmpty()){ros18S.add(ssu, r.numericID);} jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(rosAmino!=null){ jpayne@68: if(mode==TRANSLATE){ jpayne@68: if(list!=null && !list.isEmpty()){ jpayne@68: ArrayList prots=translate(r, list); jpayne@68: if(prots!=null){rosAmino.add(prots, r.numericID);} jpayne@68: } jpayne@68: }else if(mode==RETRANSLATE) { jpayne@68: if(list!=null && !list.isEmpty()){ jpayne@68: ArrayList prots=translate(r, list); jpayne@68: ArrayList ret=detranslate(prots); jpayne@68: if(ret!=null){rosAmino.add(ret, r.numericID);} jpayne@68: } jpayne@68: }else if(mode==RECODE) { jpayne@68: if(list!=null && !list.isEmpty()){ jpayne@68: Read recoded=recode(r, list); jpayne@68: r.mate=null; jpayne@68: ArrayList rec=new ArrayList(1); jpayne@68: rec.add(recoded); jpayne@68: if(rec!=null){rosAmino.add(rec, r.numericID);} jpayne@68: } jpayne@68: }else{ jpayne@68: assert(false) : mode; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /** Number of reads processed by this thread */ jpayne@68: protected long readsInT=0; jpayne@68: /** Number of bases processed by this thread */ jpayne@68: protected long basesInT=0; jpayne@68: jpayne@68: /** Number of genes called by this thread */ jpayne@68: protected long genesOutT=0; jpayne@68: /** Number of bytes written by this thread */ jpayne@68: protected long bytesOutT=0; jpayne@68: jpayne@68: final long[] geneHistT; jpayne@68: jpayne@68: protected ConcurrentReadOutputStream rosAmino; jpayne@68: protected ConcurrentReadOutputStream ros16S; jpayne@68: protected ConcurrentReadOutputStream ros18S; jpayne@68: jpayne@68: /** True only if this thread has completed successfully */ jpayne@68: boolean success=false; jpayne@68: jpayne@68: /** Shared input stream */ jpayne@68: private final ConcurrentReadInputStream cris; jpayne@68: /** Shared output stream */ jpayne@68: private final ByteStreamWriter bsw; jpayne@68: /** Gene Model for annotation (not really needed) */ jpayne@68: private final GeneModel pgm; jpayne@68: /** Gene Caller for annotation */ jpayne@68: final GeneCaller caller; jpayne@68: /** Thread ID */ jpayne@68: final int tid; jpayne@68: } jpayne@68: jpayne@68: public static ArrayList fetchType(final Read r, final ArrayList list, int type){ jpayne@68: if(list==null || list.isEmpty()){return null;} jpayne@68: ArrayList ret=new ArrayList(list.size()); jpayne@68: for(int strand=0; strand<2; strand++){ jpayne@68: for(Orf orf : list){ jpayne@68: if(orf.strand==strand && orf.type==type){ jpayne@68: Read sequence=fetch(orf, r.bases, r.id); jpayne@68: ret.add(sequence); jpayne@68: } jpayne@68: } jpayne@68: r.reverseComplement(); jpayne@68: } jpayne@68: return (ret.size()>0 ? ret : null); jpayne@68: } jpayne@68: jpayne@68: public static ArrayList translate(final Read r, final ArrayList list){ jpayne@68: if(list==null || list.isEmpty()){return null;} jpayne@68: ArrayList prots=new ArrayList(list.size()); jpayne@68: for(int strand=0; strand<2; strand++){ jpayne@68: for(Orf orf : list){ jpayne@68: if(orf.strand==strand && orf.type==CDS){ jpayne@68: Read aa=translate(orf, r.bases, r.id); jpayne@68: prots.add(aa); jpayne@68: } jpayne@68: } jpayne@68: r.reverseComplement(); jpayne@68: } jpayne@68: return prots.isEmpty() ? null : prots; jpayne@68: } jpayne@68: jpayne@68: public static Read recode(final Read r, final ArrayList list){ jpayne@68: if(list==null || list.isEmpty()){return r;} jpayne@68: for(int strand=0; strand<2; strand++){ jpayne@68: for(Orf orf : list){ jpayne@68: if(orf.strand==strand && orf.type==CDS){ jpayne@68: recode(orf, r.bases); jpayne@68: } jpayne@68: } jpayne@68: r.reverseComplement(); jpayne@68: } jpayne@68: return r; jpayne@68: } jpayne@68: jpayne@68: public static ArrayList detranslate(final ArrayList prots){ jpayne@68: if(prots==null || prots.isEmpty()){return null;} jpayne@68: ArrayList nucs=new ArrayList(prots.size()); jpayne@68: for(int strand=0; strand<2; strand++){ jpayne@68: for(Read prot : prots){ jpayne@68: Read nuc=detranslate(prot); jpayne@68: nucs.add(nuc); jpayne@68: } jpayne@68: } jpayne@68: return nucs; jpayne@68: } jpayne@68: jpayne@68: public static Read translate(Orf orf, byte[] bases, String id){ jpayne@68: // assert(orf.length()%3==0) : orf.length(); //Happens sometimes on genes that go off the end, perhaps jpayne@68: if(orf.strand==1){orf.flip();} jpayne@68: byte[] acids=AminoAcid.toAAs(bases, orf.start, orf.stop); jpayne@68: if(orf.strand==1){orf.flip();} jpayne@68: Read r=new Read(acids, null, id+"\t"+(Shared.strandCodes[orf.strand]+"\t"+orf.start+"-"+orf.stop), 0, Read.AAMASK); jpayne@68: // assert((r.length()+1)*3==orf.length()); jpayne@68: return r; jpayne@68: } jpayne@68: jpayne@68: public static Read fetch(Orf orf, Read source){ jpayne@68: assert(orf.start>=0 && orf.stop=0 && orf.stop=0 ? AminoAcid.canonicalCodons[number] : "NNN"); jpayne@68: for(int i=0; i<3; i++, bpos++) { jpayne@68: bases[bpos]=(byte)codon.charAt(i); jpayne@68: } jpayne@68: } jpayne@68: if(orf.strand==1){orf.flip();} jpayne@68: } jpayne@68: jpayne@68: public static Read detranslate(Read prot){ jpayne@68: ByteBuilder bb=new ByteBuilder(prot.length()*3); jpayne@68: for(byte aa : prot.bases){ jpayne@68: int number=AminoAcid.acidToNumber[aa]; jpayne@68: String codon=(number>=0 ? AminoAcid.canonicalCodons[number] : "NNN"); jpayne@68: bb.append(codon); jpayne@68: } jpayne@68: Read r=new Read(bb.array, null, prot.id, prot.numericID, 0); jpayne@68: return r; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static GeneCaller makeGeneCaller(GeneModel pgm){ jpayne@68: GeneCaller caller=new GeneCaller(minLen, maxOverlapSameStrand, maxOverlapOppositeStrand, jpayne@68: minStartScore, minStopScore, minKmerScore, minOrfScore, minAvgScore, pgm); jpayne@68: return caller; jpayne@68: } jpayne@68: jpayne@68: private long maxReads=-1; jpayne@68: private boolean merge; jpayne@68: private boolean ecco; jpayne@68: jpayne@68: private long readsIn=0; jpayne@68: private long basesIn=0; jpayne@68: private long genesOut=0; jpayne@68: private long bytesOut=0; jpayne@68: jpayne@68: private static int minLen=80;//Actually a much higher value like 200 seems optimal compared to NCBI jpayne@68: private static int maxOverlapSameStrand=80; jpayne@68: private static int maxOverlapOppositeStrand=110; jpayne@68: jpayne@68: /* for kinnercds=6 */ jpayne@68: // private static float minStartScore=-0.10f; jpayne@68: // private static float minStopScore=-0.5f;//Not useful; disabled jpayne@68: // private static float minKmerScore=0.04f;//Does not seem useful. jpayne@68: // private static float minOrfScore=40f; //Higher increases SNR dramatically but reduces TP rate jpayne@68: // private static float minAvgScore=0.08f; //Not very effective jpayne@68: jpayne@68: /* for kinnercds=7 */ jpayne@68: private static float minStartScore=-0.10f; jpayne@68: private static float minStopScore=-0.5f;//Not useful; disabled jpayne@68: private static float minKmerScore=0.02f;//Does not seem useful. jpayne@68: private static float minOrfScore=50f; //Higher increases SNR dramatically but reduces TP rate jpayne@68: private static float minAvgScore=0.08f; //Not very effective jpayne@68: jpayne@68: long geneStopsMade=0; jpayne@68: long geneStartsMade=0; jpayne@68: long geneStartsRetained=0; jpayne@68: long geneStopsRetained=0; jpayne@68: long geneStartsOut=0; jpayne@68: jpayne@68: long tRNAOut=0; jpayne@68: long r16SOut=0; jpayne@68: long r23SOut=0; jpayne@68: long r5SOut=0; jpayne@68: long r18SOut=0; jpayne@68: jpayne@68: ScoreTracker stCds=new ScoreTracker(CDS); jpayne@68: ScoreTracker stCds2=new ScoreTracker(CDS); jpayne@68: ScoreTracker stCdsPass=new ScoreTracker(CDS); jpayne@68: ScoreTracker sttRNA=new ScoreTracker(tRNA); jpayne@68: ScoreTracker st16s=new ScoreTracker(r16S); jpayne@68: ScoreTracker st23s=new ScoreTracker(r23S); jpayne@68: ScoreTracker st5s=new ScoreTracker(r5S); jpayne@68: ScoreTracker st18s=new ScoreTracker(r18S); jpayne@68: jpayne@68: ScoreTracker[] trackers=new ScoreTracker[] {stCdsPass, sttRNA, st16s, st23s, st5s, st18s}; jpayne@68: jpayne@68: private int geneHistBins=2000; jpayne@68: private int geneHistDiv=20; jpayne@68: private final long[] geneHist; jpayne@68: private boolean printZeroCountHist=false; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: private ArrayList fnaList=new ArrayList(); jpayne@68: private ArrayList pgmList=new ArrayList(); jpayne@68: private String outGff=null; jpayne@68: private String outAmino=null; jpayne@68: private String out16S=null; jpayne@68: private String out18S=null; jpayne@68: private String compareToGff=null; jpayne@68: private String outStats="stderr"; jpayne@68: private String geneHistFile=null; jpayne@68: private boolean json_out=false; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Final Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: private final FileFormat ffoutGff; jpayne@68: private final FileFormat ffoutAmino; jpayne@68: private final FileFormat ffout16S; jpayne@68: private final FileFormat ffout18S; jpayne@68: jpayne@68: /** Determines how sequence is processed if it will be output */ jpayne@68: int mode=TRANSLATE; jpayne@68: jpayne@68: /** Translate nucleotides to amino acids */ jpayne@68: private static final int TRANSLATE=1; jpayne@68: /** Translate nucleotides to amino acids, jpayne@68: * then translate back to nucleotides */ jpayne@68: private static final int RETRANSLATE=2; jpayne@68: /** Re-encode coding regions of nucleotide jpayne@68: * sequences as a canonical codons */ jpayne@68: private static final int RECODE=3; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: private PrintStream outstream=System.err; jpayne@68: public boolean verbose=false; jpayne@68: public boolean errorState=false; jpayne@68: private boolean overwrite=false; jpayne@68: private boolean append=false; jpayne@68: private boolean ordered=false; //this is OK sometimes, but sometimes hangs (e.g. on RefSeq mito), possibly if a sequence produces nothing. jpayne@68: //To fix it, just ensure functions like translate always produce an array, even if it is empty. jpayne@68: jpayne@68: }