jpayne@68: package clump; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.io.PrintStream; jpayne@68: import java.util.ArrayList; jpayne@68: jpayne@68: import bloom.KCountArray; jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.FileFormat; jpayne@68: import fileIO.ReadWrite; jpayne@68: import jgi.BBMerge; jpayne@68: import shared.KillSwitch; jpayne@68: import shared.Parse; jpayne@68: import shared.Parser; jpayne@68: import shared.PreParser; jpayne@68: import shared.ReadStats; jpayne@68: import shared.Shared; jpayne@68: import shared.Timer; jpayne@68: import shared.Tools; jpayne@68: import stream.ConcurrentReadInputStream; jpayne@68: import stream.ConcurrentReadOutputStream; jpayne@68: import stream.FASTQ; jpayne@68: import stream.FastaReadInputStream; jpayne@68: import stream.Read; jpayne@68: import structures.ListNum; jpayne@68: import structures.Quantizer; jpayne@68: jpayne@68: /** jpayne@68: * @author Brian Bushnell jpayne@68: * @date June 20, 2014 jpayne@68: * jpayne@68: */ jpayne@68: public class KmerSplit { jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Initialization ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Code entrance from the command line. jpayne@68: * @param args Command line arguments jpayne@68: */ jpayne@68: public static void main(String[] args){ jpayne@68: final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ; jpayne@68: final boolean oldFInt=FASTQ.FORCE_INTERLEAVED, oldTInt=FASTQ.TEST_INTERLEAVED; jpayne@68: final int zl=ReadWrite.ZIPLEVEL; jpayne@68: final float ztd=ReadWrite.ZIP_THREAD_MULT; jpayne@68: final int mzt=ReadWrite.MAX_ZIP_THREADS; jpayne@68: Timer t=new Timer(); jpayne@68: KmerSplit x=new KmerSplit(args); jpayne@68: ReadWrite.ZIPLEVEL=Tools.min(ReadWrite.ZIPLEVEL, maxZipLevel); jpayne@68: x.process(t); jpayne@68: ReadWrite.USE_PIGZ=pigz; jpayne@68: ReadWrite.USE_UNPIGZ=unpigz; jpayne@68: ReadWrite.ZIPLEVEL=zl; jpayne@68: ReadWrite.ZIP_THREAD_MULT=ztd; jpayne@68: ReadWrite.MAX_ZIP_THREADS=mzt; jpayne@68: FASTQ.FORCE_INTERLEAVED=oldFInt; jpayne@68: FASTQ.TEST_INTERLEAVED=oldTInt; jpayne@68: jpayne@68: //Close the print stream if it was redirected jpayne@68: Shared.closeStream(x.outstream); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Constructor. jpayne@68: * @param args Command line arguments jpayne@68: */ jpayne@68: public KmerSplit(String[] args){ jpayne@68: jpayne@68: {//Preparse block for help, config files, and outstream jpayne@68: PreParser pp=new PreParser(args, getClass(), false); jpayne@68: args=pp.args; jpayne@68: outstream=pp.outstream; jpayne@68: } jpayne@68: jpayne@68: ReadWrite.USE_PIGZ=false; jpayne@68: ReadWrite.USE_UNPIGZ=true; jpayne@68: ReadWrite.MAX_ZIP_THREADS=Shared.threads(); jpayne@68: jpayne@68: boolean setInterleaved=false; //Whether it was explicitly set. jpayne@68: Parser parser=new Parser(); jpayne@68: jpayne@68: for(int i=0; i1 ? split[1] : null; jpayne@68: jpayne@68: if(parser.parse(arg, a, b)){ jpayne@68: //do nothing jpayne@68: }else if(a.equals("verbose")){ jpayne@68: verbose=KmerComparator.verbose=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("parse_flag_goes_here")){ jpayne@68: //Set a variable here jpayne@68: }else if(a.equals("k")){ jpayne@68: k=Integer.parseInt(b); jpayne@68: assert(k>0 && k<32); jpayne@68: }else if(a.equals("mincount") || a.equals("mincr")){ jpayne@68: minCount=Integer.parseInt(b); jpayne@68: }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){ jpayne@68: groups=Integer.parseInt(b); jpayne@68: }else if(a.equals("rename") || a.equals("addname")){ jpayne@68: //Do nothing jpayne@68: // addName=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("shortname") || a.equals("shortnames")){ jpayne@68: if(b!=null && b.equals("shrink")){ jpayne@68: shrinkName=true; jpayne@68: }else{ jpayne@68: shrinkName=false; jpayne@68: shortName=Parse.parseBoolean(b); jpayne@68: } jpayne@68: }else if(a.equals("rcomp") || a.equals("reversecomplement")){ jpayne@68: //ignore rcomp=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("condense") || a.equals("consensus") || a.equals("concensus")){//Note the last one is intentionally misspelled jpayne@68: //ignore jpayne@68: }else if(a.equals("correct") || a.equals("ecc")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("passes")){ jpayne@68: int x=Integer.parseInt(b); jpayne@68: // if(x>1){outstream.println("Warning: KmerSplit does not support multiple passes.");} jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("dedupe")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("entryfilter")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("markduplicates")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("markall")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("addcount") || a.equals("renamebycount")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("optical") || a.equals("opticalonly")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("scanlimit") || a.equals("scan")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("removeallduplicates") || a.equals("allduplicates")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("allowns")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){ jpayne@68: //ignore jpayne@68: }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("printduplicates")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("dupeidentity")){ jpayne@68: //ignore jpayne@68: }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){ jpayne@68: //ignore jpayne@68: } jpayne@68: jpayne@68: else if(a.equals("prefilter")){ jpayne@68: KmerReduce.prefilter=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("ecco")){ jpayne@68: ecco=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("seed")){ jpayne@68: KmerComparator.defaultSeed=Long.parseLong(b); jpayne@68: }else if(a.equals("hashes")){ jpayne@68: KmerComparator.setHashes(Integer.parseInt(b)); jpayne@68: }else if(a.equals("border")){ jpayne@68: KmerComparator.defaultBorder=Integer.parseInt(b); jpayne@68: }else if(a.equals("minprob")){ jpayne@68: KmerComparator.minProb=Float.parseFloat(b); jpayne@68: }else if(a.equals("unpair")){ jpayne@68: unpair=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("repair")){ jpayne@68: //Do nothing jpayne@68: }else if(a.equals("namesort") || a.equals("sort")){ jpayne@68: //Do nothing jpayne@68: }else if(a.equals("fetchthreads")){ jpayne@68: //Do nothing jpayne@68: }else if(a.equals("reorder") || a.equals("reorderclumps")){ jpayne@68: //reorder=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("reorderpaired") || a.equals("reorderclumpspaired")){ jpayne@68: // reorderpaired=Parse.parseBoolean(b); jpayne@68: } jpayne@68: jpayne@68: jpayne@68: else if(Clump.parseStatic(arg, a, b)){ jpayne@68: //Do nothing jpayne@68: } jpayne@68: jpayne@68: else{ jpayne@68: outstream.println("Unknown parameter "+args[i]); jpayne@68: assert(false) : "Unknown parameter "+args[i]; jpayne@68: // throw new RuntimeException("Unknown parameter "+args[i]); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: {//Process parser fields jpayne@68: Parser.processQuality(); jpayne@68: jpayne@68: maxReads=parser.maxReads; jpayne@68: jpayne@68: overwrite=ReadStats.overwrite=parser.overwrite; jpayne@68: append=ReadStats.append=parser.append; jpayne@68: jpayne@68: setInterleaved=parser.setInterleaved; jpayne@68: jpayne@68: in1=parser.in1; jpayne@68: in2=parser.in2; jpayne@68: jpayne@68: out1=parser.out1; jpayne@68: jpayne@68: extin=parser.extin; jpayne@68: extout=parser.extout; jpayne@68: } jpayne@68: jpayne@68: if(groups>2){ReadWrite.USE_PIGZ=false;} jpayne@68: jpayne@68: if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ jpayne@68: in2=in1.replace("#", "2"); jpayne@68: in1=in1.replace("#", "1"); jpayne@68: } jpayne@68: if(in2!=null){ jpayne@68: if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} jpayne@68: FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; jpayne@68: } jpayne@68: jpayne@68: assert(FastaReadInputStream.settingsOK()); jpayne@68: jpayne@68: if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} jpayne@68: if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ jpayne@68: ByteFile.FORCE_MODE_BF2=true; jpayne@68: } jpayne@68: jpayne@68: if(!setInterleaved){ jpayne@68: assert(in1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\n"; jpayne@68: if(in2!=null){ //If there are 2 input streams. jpayne@68: FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; jpayne@68: outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} jpayne@68: jpayne@68: if(out1!=null){ jpayne@68: assert(out1.contains("%")); jpayne@68: outArray=new String[groups]; jpayne@68: for(int i=0; i1){ReadWrite.setZipThreadMult(Tools.min(0.5f, 2f/(groups+1)));} jpayne@68: for(int i=0; i1){ jpayne@68: table=ClumpTools.getTable(in1, in2, k, minCount); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /** Create read streams and process all data */ jpayne@68: void process(Timer t){ jpayne@68: jpayne@68: preprocess(); jpayne@68: jpayne@68: final ConcurrentReadInputStream cris; jpayne@68: { jpayne@68: cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null); jpayne@68: cris.start(); jpayne@68: if(verbose){outstream.println("Started cris");} jpayne@68: } jpayne@68: boolean paired=cris.paired(); jpayne@68: if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} jpayne@68: if(cris.paired() && (in1==null || !in1.contains(".sam") && !unpair)){ jpayne@68: outstream.println("Writing interleaved."); jpayne@68: } jpayne@68: jpayne@68: final ConcurrentReadOutputStream ros[]=new ConcurrentReadOutputStream[groups]; jpayne@68: try { jpayne@68: for(int i=0; i alht=new ArrayList(threads); jpayne@68: for(int i=0; i ln=cris.nextList(); jpayne@68: ArrayList reads=(ln!=null ? ln.list : null); jpayne@68: jpayne@68: ArrayList[] array=new ArrayList[groups]; jpayne@68: for(int i=0; i(buffer); jpayne@68: } jpayne@68: jpayne@68: while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning jpayne@68: jpayne@68: for(Read r : reads){ jpayne@68: if(!r.validated()){ jpayne@68: r.validate(true); jpayne@68: if(r.mate!=null){r.mate.validate(true);} jpayne@68: } jpayne@68: readsProcessedT+=1+r.mateCount(); jpayne@68: basesProcessedT+=r.length()+r.mateLength(); jpayne@68: diskProcessedT+=r.countFastqBytes()+r.countMateFastqBytes(); jpayne@68: memProcessedT+=r.countBytes()+r.countMateBytes()+ReadKey.overhead; jpayne@68: if(shrinkName){ jpayne@68: Clumpify.shrinkName(r); jpayne@68: Clumpify.shrinkName(r.mate); jpayne@68: }else if(shortName){ jpayne@68: Clumpify.shortName(r); jpayne@68: Clumpify.shortName(r.mate); jpayne@68: } jpayne@68: jpayne@68: if(quantizeQuality){ jpayne@68: Quantizer.quantize(r, r.mate); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(ecco){ jpayne@68: for(Read r : reads){ jpayne@68: if(r.mate!=null){BBMerge.findOverlapStrict(r, r.mate, true);} jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: ArrayList hashList=reads; jpayne@68: if(paired && unpair){ jpayne@68: hashList=new ArrayList(reads.size()*2); jpayne@68: for(Read r1 : reads){ jpayne@68: Read r2=r1.mate; jpayne@68: hashList.add(r1); jpayne@68: hashList.add(r2); jpayne@68: r1.mate=null; jpayne@68: r2.mate=null; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: kc.hash(hashList, table, minCount, true); jpayne@68: for(Read r : hashList){ jpayne@68: long kmer=((ReadKey)r.obj).kmer; jpayne@68: long code=kc.hash(kmer); jpayne@68: int code2=(int)(code%groups); jpayne@68: assert(code2>=0 && code2=buffer){ jpayne@68: ros[code2].add(array[code2], 0); jpayne@68: array[code2]=new ArrayList(buffer); jpayne@68: } jpayne@68: } jpayne@68: cris.returnList(ln); jpayne@68: ln=cris.nextList(); jpayne@68: reads=(ln!=null ? ln.list : null); jpayne@68: } jpayne@68: if(ln!=null){ jpayne@68: cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); jpayne@68: } jpayne@68: for(int i=0; i