Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/KmerLimit2.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package sketch; import java.io.File; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Locale; import java.util.Random; import dna.AminoAcid; import fileIO.ByteFile; import fileIO.FileFormat; import fileIO.ReadWrite; import shared.Parse; import shared.Parser; import shared.PreParser; import shared.ReadStats; import shared.Shared; import shared.Timer; import shared.Tools; import stream.ConcurrentReadInputStream; import stream.ConcurrentReadOutputStream; import stream.FASTQ; import stream.FastaReadInputStream; import stream.Read; import structures.IntMap; import structures.ListNum; /** * * @author Brian Bushnell * @date July 30, 2018 * */ public class KmerLimit2 extends SketchObject { /*--------------------------------------------------------------*/ /*---------------- Initialization ----------------*/ /*--------------------------------------------------------------*/ /** * Code entrance from the command line. * @param args Command line arguments */ public static void main(String[] args){ //Start a timer immediately upon code entrance. Timer t=new Timer(); //Create an instance of this class KmerLimit2 x=new KmerLimit2(args); //Run the object x.process(t); //Close the print stream if it was redirected Shared.closeStream(x.outstream); } /** * Constructor. * @param args Command line arguments */ public KmerLimit2(String[] args){ {//Preparse block for help, config files, and outstream PreParser pp=new PreParser(args, getClass(), false); args=pp.args; outstream=pp.outstream; } boolean setInterleaved=false; //Whether interleaved was explicitly set. //Set shared static variables ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; ReadWrite.MAX_ZIP_THREADS=Shared.threads(); SketchObject.setKeyFraction(0.1); defaultParams.minEntropy=0; defaultParams.minProb=0.2f; boolean setHeapSize=false; int heapSize_=8091; long targetKmers_=0; int k_=32; int minCount_=1; //Create a parser object Parser parser=new Parser(); parser.overwrite=true; //Parse each argument for(int i=0; i<args.length; i++){ String arg=args[i]; //Break arguments into their constituent parts, in the form of "a=b" String[] split=arg.split("="); String a=split[0].toLowerCase(); String b=split.length>1 ? split[1] : null; if(b!=null && b.equalsIgnoreCase("null")){b=null;} if(a.equals("verbose")){ verbose=Parse.parseBoolean(b); }else if(a.equals("ordered")){ ordered=Parse.parseBoolean(b); }else if(a.equals("size") || a.equals("heapsize")){ heapSize_=Parse.parseIntKMG(b); setHeapSize=true; }else if(a.equals("kmers") || a.equals("target") || a.equals("limit")){ targetKmers_=Parse.parseKMG(b); }else if(a.equals("mincount")){ minCount_=Parse.parseIntKMG(b); }else if(a.equals("maxexpandedlength") || a.equals("maxlength") || a.equals("maxlen")){ maxExpandedLength=Parse.parseIntKMG(b); }else if(a.equals("seed")){ seed=Parse.parseKMG(b); }else if(a.equals("trials")){ trials=Parse.parseIntKMG(b); }else if(parseSketchFlags(arg, a, b)){ parser.parse(arg, a, b); }else if(defaultParams.parse(arg, a, b)){ parser.parse(arg, a, b); }else if(a.equals("parse_flag_goes_here")){ long fake_variable=Parse.parseKMG(b); //Set a variable here }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser //do nothing }else{ outstream.println("Unknown parameter "+args[i]); assert(false) : "Unknown parameter "+args[i]; } } if(!setHeapSize && minCount_>1){heapSize_=32000;} heapSize=heapSize_; targetKmers=targetKmers_; k=k_; minCount=minCount_; assert(targetKmers>0) : "Must set a kmer limit."; assert(heapSize>0) : "Heap size must be positive."; assert(k>0 && k<=32) : "0<k<33; k="+k; postParse(); // if(minCount>1){ // Shared.setBufferLen(800); // } {//Process parser fields Parser.processQuality(); maxReads=parser.maxReads; overwrite=ReadStats.overwrite=parser.overwrite; append=ReadStats.append=parser.append; setInterleaved=parser.setInterleaved; in1=parser.in1; in2=parser.in2; qfin1=parser.qfin1; qfin2=parser.qfin2; out1=parser.out1; out2=parser.out2; qfout1=parser.qfout1; qfout2=parser.qfout2; extin=parser.extin; extout=parser.extout; } //Do input file # replacement if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ in2=in1.replace("#", "2"); in1=in1.replace("#", "1"); } //Do output file # replacement if(out1!=null && out2==null && out1.indexOf('#')>-1){ out2=out1.replace("#", "2"); out1=out1.replace("#", "1"); } //Adjust interleaved detection based on the number of input files if(in2!=null){ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; } assert(FastaReadInputStream.settingsOK()); //Ensure there is an input file if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} //Adjust the number of threads for input file reading if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ ByteFile.FORCE_MODE_BF2=true; } //Ensure out2 is not set without out1 if(out1==null && out2!=null){throw new RuntimeException("Error - cannot define out2 without defining out1.");} //Adjust interleaved settings based on number of output files if(!setInterleaved){ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; if(in2!=null){ //If there are 2 input streams. FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); }else{ //There is one input stream. if(out2!=null){ FASTQ.FORCE_INTERLEAVED=true; FASTQ.TEST_INTERLEAVED=false; outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); } } } //Ensure output files can be written if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2); throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); } //Ensure input files can be read if(!Tools.testInputFiles(false, true, in1, in2)){ throw new RuntimeException("\nCan't read some input files.\n"); } //Ensure that no file was specified multiple times if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){ throw new RuntimeException("\nSome file names were specified multiple times.\n"); } //Create output FileFormat objects ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered); ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, ordered); //Create input FileFormat objects ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); minProb=defaultParams.minProb; minQual=defaultParams.minQual; shift=2*k; shift2=shift-2; mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 sharedHeap=new SketchHeap(heapSize, 0, true); } /*--------------------------------------------------------------*/ /*---------------- Outer Methods ----------------*/ /*--------------------------------------------------------------*/ /** Create read streams and process all data */ void process(Timer t){ //Turn off read validation in the input threads to increase speed final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; // //Optionally create a read output stream // final ConcurrentReadOutputStream ros; // if(ffout1!=null){ // //Select output buffer size based on whether it needs to be ordered // final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); // // //Notify user of output mode // if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){ // outstream.println("Writing interleaved."); // } // // ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false); // ros.start(); //Start the stream // }else{ros=null;} //Reset counters readsProcessed=readsOut=0; basesProcessed=basesOut=0; //Process the reads in separate threads spawnThreads0(); // if(verbose){outstream.println("Finished; closing streams.");} //Reset read validation Read.VALIDATE_IN_CONSTRUCTOR=vic; Sketch sketch=new Sketch(sharedHeap, true, true, null); sketch=capLengthAtCountSum(sketch, maxExpandedLength); final long reads=Tools.max(1, sketch.genomeSequences); final long targetReads=calcTargetReads(sketch, targetKmers, minCount, trials, seed); final double targetRate=Tools.min(1, targetReads/(double)reads); final String targetRateS=String.format(Locale.ROOT, "%.4f%%",targetRate*100); //Report timing and results t.stop(); outstream.println("Finished counting kmers."); outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); String kstring0=Tools.padKM(sketch.genomeSizeEstimate(minCount), 8); String rstring0=Tools.padKM(targetReads, 8); outstream.println("Unique Kmers: "+kstring0); outstream.println("Target Reads: "+rstring0+"\t"+targetRateS); // outstream.println("Reads: \t"+reads); // outstream.println("Unique Kmers: \t"+sketch.genomeSizeEstimate(minCount)); // outstream.println("Target Reads: \t"+targetReads); // outstream.println("Sample Rate: \t"+targetRateS); // outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); t.start(); outstream.println("\nSubsampling reads."); // String kstring=Tools.padKM(sharedHeap.genomeSizeEstimate(minCount), 8); // outstream.println("Unique Kmers Out: "+kstring); // ArrayList<String> args=new ArrayList<String>(); // args.add("in="+in1); // if(in2!=null){args.add("in2="+in2);} // args.add("out="+out1); // if(out2!=null){args.add("out2="+out2);} // args.add("ordered="+ordered); // args.add("ow="+(overwrite ? "t" : "f")); // if(targetRate<1){args.add("samplerate="+targetRateS);} // args.add("loglogout"); // args.add("loglogk="+k); // args.add("loglogminprob="+minProb); // BBDukF.main(args.toArray(new String[0])); // Sketch sk=new Sketch(sharedHeap, true, true, null); // outstream.println(sk.genomeSizeEstimate()); spawnThreads2(targetRate); t.stop(); outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); String kstring=Tools.padKM(sharedHeap.genomeSizeEstimate(minCount), 8); outstream.println("Unique Kmers Out: "+kstring); //Throw an exception of there was an error in a thread if(errorState){ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); } } /** Spawn process threads */ private void spawnThreads0(){ //Create a read input stream final ConcurrentReadInputStream cris; { cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); cris.start(); //Start the stream if(verbose){outstream.println("Started cris");} } paired=cris.paired(); if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} //Determine how many threads may be used final int threads=Tools.min(10, Shared.threads()); //Fill a list with ProcessThreads ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); for(int i=0; i<threads; i++){ alpt.add(new ProcessThread(cris, null, i, heapSize)); } //Start the threads for(ProcessThread pt : alpt){ pt.start(); } //Wait for completion of all threads boolean success=true; for(ProcessThread pt : alpt){ //Wait until this thread has terminated while(pt.getState()!=Thread.State.TERMINATED){ try { //Attempt a join operation pt.join(); } catch (InterruptedException e) { //Potentially handle this, if it is expected to occur e.printStackTrace(); } } //Accumulate per-thread statistics readsProcessed+=pt.readsProcessedT; basesProcessed+=pt.basesProcessedT; readsOut+=pt.readsOutT; basesOut+=pt.basesOutT; success&=pt.success; } //Track whether any threads failed if(!success){errorState=true;} //Do anything necessary after processing //Close the read streams errorState|=ReadWrite.closeStreams(cris); } /** Spawn process threads */ private void spawnThreads2(double rate){ //Create a read input stream final ConcurrentReadInputStream cris; { cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); cris.setSampleRate((float)rate, seed); cris.start(); //Start the stream if(verbose){outstream.println("Started cris");} } // paired=cris.paired(); // if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} //Optionally create a read output stream final ConcurrentReadOutputStream ros; if(ffout1!=null){ //Select output buffer size based on whether it needs to be ordered final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); //Notify user of output mode if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){ outstream.println("Writing interleaved."); } ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false); ros.start(); //Start the stream }else{ros=null;} //Determine how many threads may be used final int threads=Tools.min(10, Shared.threads()); sharedHeap.clear(); // readsProcessed=0; // basesProcessed=0; readsOut=0; basesOut=0; //Fill a list with ProcessThreads ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); for(int i=0; i<threads; i++){ alpt.add(new ProcessThread(cris, ros, i, heapSize)); } //Start the threads for(ProcessThread pt : alpt){ pt.start(); } //Wait for completion of all threads boolean success=true; for(ProcessThread pt : alpt){ //Wait until this thread has terminated while(pt.getState()!=Thread.State.TERMINATED){ try { //Attempt a join operation pt.join(); } catch (InterruptedException e) { //Potentially handle this, if it is expected to occur e.printStackTrace(); } } //Accumulate per-thread statistics // readsProcessed+=pt.readsProcessedT; // basesProcessed+=pt.basesProcessedT; readsOut+=pt.readsOutT; basesOut+=pt.basesOutT; success&=pt.success; } //Track whether any threads failed if(!success){errorState=true;} //Do anything necessary after processing //Write anything that was accumulated by ReadStats errorState|=ReadStats.writeAll(); //Close the read streams errorState|=ReadWrite.closeStreams(cris, ros); } /*--------------------------------------------------------------*/ /*---------------- Inner Methods ----------------*/ /*--------------------------------------------------------------*/ public static Sketch capLengthAtCountSum(Sketch sketch0, int max) { int len=0; long sum=0; for(; len<sketch0.keyCounts.length; len++){ sum=sum+sketch0.keyCounts[len]; if(sum>max){break;} } if(len>=sketch0.length()){return sketch0;} long[] keys=Arrays.copyOf(sketch0.keys, len); int[] counts=Arrays.copyOf(sketch0.keyCounts, len); // long[] array_, int[] counts_, int taxID_, long imgID_, long gSizeBases_, long gSizeKmers_, long gSequences_, double probCorrect_, // String taxName_, String name0_, String fname_, ArrayList<String> meta_ Sketch sk=new Sketch(keys, counts, null, null, null, -1, -1, sketch0.genomeSizeBases, sketch0.genomeSizeKmers, sketch0.genomeSequences, sketch0.probCorrect, null, null, null, null); return sk; } public static long calcTargetReads(Sketch sketch, long targetKmers, int minCount, int trials, long seed){ final int[] counts0=sketch.keyCounts; final int[] counts=Arrays.copyOf(counts0, counts0.length); final long size=sketch.genomeSizeEstimate(minCount); final long reads=sketch.genomeSequences; final double targetKmerFraction=targetKmers/(double)size; if(targetKmerFraction>=1){return reads;} final int targetKeys=(int)(targetKmerFraction*counts.length); final long countSum=Tools.sum(counts0); assert(countSum<Shared.MAX_ARRAY_LEN) : countSum; // System.err.println("countsum: "+countSum); final IntMap map=new IntMap(0, counts0.length); final int[] expanded=new int[(int)countSum]; long roundSum=0; final Random randy=Shared.threadLocalRandom(seed); for(int i=0; i<trials; i++){ Tools.fill(counts, counts0); // long rounds=reduceRounds(counts0, counts, minCount, targetKeys, randy); long rounds=reduceRoundsIM(counts0, expanded, minCount, targetKeys, randy, map); roundSum+=rounds; } double avgRounds=roundSum/(double)trials; // System.err.println("avgRounds: "+avgRounds); double targetCountFraction=1-(avgRounds/countSum); // System.err.println("targetFraction: "+targetCountFraction); return (long)(targetCountFraction*reads); } // public static int reduceRoundsOld(final int[] counts, final int minCount, final int targetKeys, final Random randy){ // assert(minCount>=0) : minCount; // int rounds=0; // int valid=0; // for(int x : counts){ // if(x>=minCount){valid++;} // } // // int len=counts.length; // System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)+", "+Arrays.toString(counts)); // for(; valid>targetKeys; rounds++){ // int pos=randy.nextInt(len); //// assert(counts[pos]>0) : pos+"/"+len+": "+targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Arrays.toString(counts); // if(counts[pos]==minCount){valid--;} // counts[pos]--; // if(counts[pos]==0){ // len--;//shrink the array // System.err.println("len="+len+", counts[len]="+counts[len]); // System.err.println("pos="+pos+", counts[pos]="+counts[pos]); // counts[pos]=counts[len];//move the last element to the empty slot // counts[len]=0; // if(pos!=len && len>0){ // assert(counts[pos]>0) : pos+"/"+len+": "+targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Arrays.toString(counts); // } // } // System.err.println(len+", "+pos+": "+Arrays.toString(counts)); // } // // System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)); // // return rounds; // } //This can be done faster with bins. //Each bin contains all kmers with count x. When a bin is hit, one kmer moves to the next bin lower. //Alternately, expand the array into one physical kmer per count. Store the current counts in an IntMap. Remove key each time. public static long reduceRounds(final int[] counts0, final int[] counts, final int minCount, final int targetKeys, final Random randy){ assert(minCount>=0) : minCount; long rounds=0; int valid=0; for(int x : counts){ if(x>=minCount){valid++;} } int len=counts.length; final long sum0=Tools.sum(counts); long sum=sum0; // System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)+", "+Arrays.toString(counts)); for(; valid>targetKeys; rounds++){ long posNum=(Long.MAX_VALUE&randy.nextLong())%sum; long sum2=0; int pos=0; for(int i=0; i<counts.length; i++){ int x=counts[i]; if(x>0){ sum2+=x; if(sum2>=posNum){ pos=i; break; } } } // for(int i=0; i<counts0.length; i++){ // int x=counts0[i]; // if(x>0){ // sum2+=x; // if(sum2>=posNum){ // pos=i; // break; // } // } // } sum--; assert(counts[pos]>0) : pos+"/"+len+": "+targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Arrays.toString(counts); if(counts[pos]==minCount){valid--;} counts[pos]--; if(counts[pos]==0){ len--;//shrink the array } // System.err.println(len+", "+pos+": "+Arrays.toString(counts)); } // System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)); return rounds; } //This can be done faster with bins. //Each bin contains all kmers with count x. When a bin is hit, one kmer moves to the next bin lower. //Alternately, expand the array into one physical kmer per count. Store the current counts in an IntMap. Remove key each time. public static long reduceRoundsIM(final int[] counts0, final int[] expanded, final int minCount, final int targetKeys, final Random randy, final IntMap map){ assert(minCount>=0) : minCount; long rounds=0; int valid=0; map.clear(); for(int i=0, k=0; i<counts0.length; i++){ int x=counts0[i]; // counts[i]=counts0[i]; if(x>=minCount){valid++;} map.put(i, x); for(int j=0; j<x; j++, k++){ expanded[k]=i; } } assert(expanded.length==Tools.sum(counts0)); int len=expanded.length; // System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)+", "+Arrays.toString(counts)); for(; valid>targetKeys; rounds++){ final int pos=randy.nextInt(len); final int key=expanded[pos]; final int x=map.get(key); assert(x>0); if(x==minCount){valid--;} map.put(key, x-1); len--;//shrink the array // System.err.println("len="+len+", counts[len]="+counts[len]); // System.err.println("pos="+pos+", counts[pos]="+counts[pos]); expanded[pos]=expanded[len];//move the last element to the empty slot expanded[len]=0; // System.err.println(len+", "+pos+": "+Arrays.toString(counts)); } // System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)); return rounds; } /*--------------------------------------------------------------*/ /*---------------- Inner Classes ----------------*/ /*--------------------------------------------------------------*/ /** This class is static to prevent accidental writing to shared variables. * It is safe to remove the static modifier. */ private class ProcessThread extends Thread { //Constructor ProcessThread(final ConcurrentReadInputStream cris_, final ConcurrentReadOutputStream ros_, final int tid_, final int size){ cris=cris_; ros=ros_; tid=tid_; localHeap=new SketchHeap(size, 0, true); } //Called by start() @Override public void run(){ //Do anything necessary prior to processing //Process the reads processInner(); //Do anything necessary after processing dumpHeap(); //Indicate successful exit status success=true; } /** Iterate through the reads */ void processInner(){ //Grab the first ListNum of reads ListNum<Read> ln=cris.nextList(); //Grab the actual read list from the ListNum ArrayList<Read> reads=(ln!=null ? ln.list : null); //Check to ensure pairing is as expected if(reads!=null && !reads.isEmpty()){ Read r=reads.get(0); // assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access } //As long as there is a nonempty read list... while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access //Loop through each read in the list for(int idx=0; idx<reads.size(); idx++){ final Read r1=reads.get(idx); final Read r2=r1.mate; //Validate reads in worker threads if(!r1.validated()){r1.validate(true);} if(r2!=null && !r2.validated()){r2.validate(true);} //Track the initial length for statistics final int initialLength1=r1.length(); final int initialLength2=r1.mateLength(); //Increment counters readsProcessedT+=r1.pairCount(); basesProcessedT+=initialLength1+initialLength2; //Reads are processed in this block. processReadPair(r1, r2); } if(ros!=null){ for(Read r1 : reads){ readsOutT+=r1.pairCount(); basesOutT+=r1.pairLength(); } //Output reads to the output stream if(ros!=null){ros.add(reads, ln.id);} } //Notify the input stream that the list was used cris.returnList(ln); // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access //Fetch a new list ln=cris.nextList(); reads=(ln!=null ? ln.list : null); } //Notify the input stream that the final list was used if(ln!=null){ if(ln.list!=null){ln.list.clear();} cris.returnList(ln.id, true); } } /** * Process a read or a read pair. * @param r1 Read 1 * @param r2 Read 2 (may be null) */ void processReadPair(final Read r1, final Read r2){ processReadNucleotide(r1); if(r2!=null){processReadNucleotide(r2);} } void processReadNucleotide(final Read r){ final byte[] bases=r.bases; final byte[] quals=r.quality; long kmer=0; long rkmer=0; int len=0; assert(!r.aminoacid()); final long min=minHashValue; localHeap.genomeSizeBases+=r.length(); localHeap.genomeSequences++; if(quals==null || (minProb<=0 && minQual<2)){ for(int i=0; i<bases.length; i++){ byte b=bases[i]; long x=AminoAcid.baseToNumber[b]; long x2=AminoAcid.baseToComplementNumber[b]; kmer=((kmer<<2)|x)&mask; rkmer=((rkmer>>>2)|(x2<<shift2))&mask; if(x<0){len=0; rkmer=0;}else{len++;} if(len>=k){ localHeap.genomeSizeKmers++; final long hashcode=hash(kmer, rkmer); if(hashcode>min){localHeap.add(hashcode);} } } }else{ float prob=1; for(int i=0; i<bases.length; i++){ final byte b=bases[i]; final long x=AminoAcid.baseToNumber[b]; final long x2=AminoAcid.baseToComplementNumber[b]; {//Quality-related stuff final byte q=quals[i]; assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual; prob=prob*align2.QualityTools.PROB_CORRECT[q]; if(len>k){ byte oldq=quals[i-k]; prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; } if(x<0 || q<minQual){ len=0; kmer=rkmer=0; prob=1; }else{ len++; } } kmer=((kmer<<2)|x)&mask; rkmer=((rkmer>>>2)|(x2<<shift2))&mask; if(len>=k && prob>=minProb){ localHeap.genomeSizeKmers++; localHeap.probSum+=prob; final long hashcode=hash(kmer, rkmer); if(hashcode>min){localHeap.checkAndAdd(hashcode);} } } } } private void dumpHeap(){ synchronized(sharedHeap){ sharedHeap.add(localHeap); } } /** Number of reads processed by this thread */ protected long readsProcessedT=0; /** Number of bases processed by this thread */ protected long basesProcessedT=0; /** Number of reads retained by this thread */ protected long readsOutT=0; /** Number of bases retained by this thread */ protected long basesOutT=0; /** True only if this thread has completed successfully */ boolean success=false; /** Shared input stream */ private final ConcurrentReadInputStream cris; /** Shared output stream */ private final ConcurrentReadOutputStream ros; /** Thread ID */ final int tid; final SketchHeap localHeap; } /*--------------------------------------------------------------*/ /*---------------- Fields ----------------*/ /*--------------------------------------------------------------*/ /** Primary input file path */ private String in1=null; /** Secondary input file path */ private String in2=null; private String qfin1=null; private String qfin2=null; /** Primary output file path */ private String out1=null; /** Secondary output file path */ private String out2=null; private String qfout1=null; private String qfout2=null; /** Override input file extension */ private String extin=null; /** Override output file extension */ private String extout=null; /*--------------------------------------------------------------*/ /** Number of reads processed */ protected long readsProcessed=0; /** Number of bases processed */ protected long basesProcessed=0; /** Number of reads retained */ protected long readsOut=0; /** Number of bases retained */ protected long basesOut=0; /** Quit after processing this many input reads; -1 means no limit */ private long maxReads=-1; private boolean paired=false; private int trials=25; private long seed=-1; private int maxExpandedLength=50000000; /*--------------------------------------------------------------*/ /*---------------- Final Fields ----------------*/ /*--------------------------------------------------------------*/ /** Primary input file */ private final FileFormat ffin1; /** Secondary input file */ private final FileFormat ffin2; /** Primary output file */ private final FileFormat ffout1; /** Secondary output file */ private final FileFormat ffout2; private final SketchHeap sharedHeap; private final int heapSize; private final long targetKmers; private final int minCount; final int shift; final int shift2; final long mask; final float minProb; final byte minQual; /*--------------------------------------------------------------*/ /*---------------- Common Fields ----------------*/ /*--------------------------------------------------------------*/ /** Print status messages to this output stream */ private PrintStream outstream=System.err; /** Print verbose messages */ public static boolean verbose=false; /** True if an error was encountered */ public boolean errorState=false; /** Overwrite existing output files */ private boolean overwrite=true; /** Append to existing output files */ private boolean append=false; /** Reads are output in input order (not enabled) */ private boolean ordered=true; }