Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/KmerLimit2.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/KmerLimit2.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,971 @@ +package sketch; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Locale; +import java.util.Random; + +import dna.AminoAcid; +import fileIO.ByteFile; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.ReadStats; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import stream.ConcurrentReadInputStream; +import stream.ConcurrentReadOutputStream; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; +import structures.IntMap; +import structures.ListNum; + +/** + * + * @author Brian Bushnell + * @date July 30, 2018 + * + */ +public class KmerLimit2 extends SketchObject { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + //Start a timer immediately upon code entrance. + Timer t=new Timer(); + + //Create an instance of this class + KmerLimit2 x=new KmerLimit2(args); + + //Run the object + x.process(t); + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public KmerLimit2(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), false); + args=pp.args; + outstream=pp.outstream; + } + + boolean setInterleaved=false; //Whether interleaved was explicitly set. + + //Set shared static variables + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + SketchObject.setKeyFraction(0.1); + defaultParams.minEntropy=0; + defaultParams.minProb=0.2f; + + boolean setHeapSize=false; + int heapSize_=8091; + long targetKmers_=0; + int k_=32; + int minCount_=1; + + //Create a parser object + Parser parser=new Parser(); + parser.overwrite=true; + + //Parse each argument + for(int i=0; i<args.length; i++){ + String arg=args[i]; + + //Break arguments into their constituent parts, in the form of "a=b" + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); + }else if(a.equals("ordered")){ + ordered=Parse.parseBoolean(b); + }else if(a.equals("size") || a.equals("heapsize")){ + heapSize_=Parse.parseIntKMG(b); + setHeapSize=true; + }else if(a.equals("kmers") || a.equals("target") || a.equals("limit")){ + targetKmers_=Parse.parseKMG(b); + }else if(a.equals("mincount")){ + minCount_=Parse.parseIntKMG(b); + }else if(a.equals("maxexpandedlength") || a.equals("maxlength") || a.equals("maxlen")){ + maxExpandedLength=Parse.parseIntKMG(b); + }else if(a.equals("seed")){ + seed=Parse.parseKMG(b); + }else if(a.equals("trials")){ + trials=Parse.parseIntKMG(b); + }else if(parseSketchFlags(arg, a, b)){ + parser.parse(arg, a, b); + }else if(defaultParams.parse(arg, a, b)){ + parser.parse(arg, a, b); + }else if(a.equals("parse_flag_goes_here")){ + long fake_variable=Parse.parseKMG(b); + //Set a variable here + }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser + //do nothing + }else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + } + } + + if(!setHeapSize && minCount_>1){heapSize_=32000;} + heapSize=heapSize_; + targetKmers=targetKmers_; + k=k_; + minCount=minCount_; + assert(targetKmers>0) : "Must set a kmer limit."; + assert(heapSize>0) : "Heap size must be positive."; + assert(k>0 && k<=32) : "0<k<33; k="+k; + postParse(); + +// if(minCount>1){ +// Shared.setBufferLen(800); +// } + + {//Process parser fields + Parser.processQuality(); + + maxReads=parser.maxReads; + + overwrite=ReadStats.overwrite=parser.overwrite; + append=ReadStats.append=parser.append; + setInterleaved=parser.setInterleaved; + + in1=parser.in1; + in2=parser.in2; + qfin1=parser.qfin1; + qfin2=parser.qfin2; + + out1=parser.out1; + out2=parser.out2; + qfout1=parser.qfout1; + qfout2=parser.qfout2; + + extin=parser.extin; + extout=parser.extout; + } + + //Do input file # replacement + if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ + in2=in1.replace("#", "2"); + in1=in1.replace("#", "1"); + } + + //Do output file # replacement + if(out1!=null && out2==null && out1.indexOf('#')>-1){ + out2=out1.replace("#", "2"); + out1=out1.replace("#", "1"); + } + + //Adjust interleaved detection based on the number of input files + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + assert(FastaReadInputStream.settingsOK()); + + //Ensure there is an input file + if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} + + //Adjust the number of threads for input file reading + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ + ByteFile.FORCE_MODE_BF2=true; + } + + //Ensure out2 is not set without out1 + if(out1==null && out2!=null){throw new RuntimeException("Error - cannot define out2 without defining out1.");} + + //Adjust interleaved settings based on number of output files + if(!setInterleaved){ + assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; + if(in2!=null){ //If there are 2 input streams. + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else{ //There is one input stream. + if(out2!=null){ + FASTQ.FORCE_INTERLEAVED=true; + FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + } + } + + //Ensure output files can be written + if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){ + outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2); + throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); + } + + //Ensure input files can be read + if(!Tools.testInputFiles(false, true, in1, in2)){ + throw new RuntimeException("\nCan't read some input files.\n"); + } + + //Ensure that no file was specified multiple times + if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){ + throw new RuntimeException("\nSome file names were specified multiple times.\n"); + } + + //Create output FileFormat objects + ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered); + ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, ordered); + + //Create input FileFormat objects + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); + + minProb=defaultParams.minProb; + minQual=defaultParams.minQual; + + shift=2*k; + shift2=shift-2; + mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 + sharedHeap=new SketchHeap(heapSize, 0, true); + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Create read streams and process all data */ + void process(Timer t){ + + //Turn off read validation in the input threads to increase speed + final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; + Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; + +// //Optionally create a read output stream +// final ConcurrentReadOutputStream ros; +// if(ffout1!=null){ +// //Select output buffer size based on whether it needs to be ordered +// final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); +// +// //Notify user of output mode +// if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){ +// outstream.println("Writing interleaved."); +// } +// +// ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false); +// ros.start(); //Start the stream +// }else{ros=null;} + + //Reset counters + readsProcessed=readsOut=0; + basesProcessed=basesOut=0; + + //Process the reads in separate threads + spawnThreads0(); + +// if(verbose){outstream.println("Finished; closing streams.");} + + //Reset read validation + Read.VALIDATE_IN_CONSTRUCTOR=vic; + + + Sketch sketch=new Sketch(sharedHeap, true, true, null); + sketch=capLengthAtCountSum(sketch, maxExpandedLength); + final long reads=Tools.max(1, sketch.genomeSequences); + final long targetReads=calcTargetReads(sketch, targetKmers, minCount, trials, seed); + final double targetRate=Tools.min(1, targetReads/(double)reads); + final String targetRateS=String.format(Locale.ROOT, "%.4f%%",targetRate*100); + + //Report timing and results + t.stop(); + outstream.println("Finished counting kmers."); + outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); + + String kstring0=Tools.padKM(sketch.genomeSizeEstimate(minCount), 8); + String rstring0=Tools.padKM(targetReads, 8); + outstream.println("Unique Kmers: "+kstring0); + outstream.println("Target Reads: "+rstring0+"\t"+targetRateS); + +// outstream.println("Reads: \t"+reads); +// outstream.println("Unique Kmers: \t"+sketch.genomeSizeEstimate(minCount)); +// outstream.println("Target Reads: \t"+targetReads); +// outstream.println("Sample Rate: \t"+targetRateS); +// outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); + + t.start(); + outstream.println("\nSubsampling reads."); + +// String kstring=Tools.padKM(sharedHeap.genomeSizeEstimate(minCount), 8); +// outstream.println("Unique Kmers Out: "+kstring); + + +// ArrayList<String> args=new ArrayList<String>(); +// args.add("in="+in1); +// if(in2!=null){args.add("in2="+in2);} +// args.add("out="+out1); +// if(out2!=null){args.add("out2="+out2);} +// args.add("ordered="+ordered); +// args.add("ow="+(overwrite ? "t" : "f")); +// if(targetRate<1){args.add("samplerate="+targetRateS);} +// args.add("loglogout"); +// args.add("loglogk="+k); +// args.add("loglogminprob="+minProb); +// BBDukF.main(args.toArray(new String[0])); + +// Sketch sk=new Sketch(sharedHeap, true, true, null); +// outstream.println(sk.genomeSizeEstimate()); + spawnThreads2(targetRate); + t.stop(); + outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); + + outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); + String kstring=Tools.padKM(sharedHeap.genomeSizeEstimate(minCount), 8); + outstream.println("Unique Kmers Out: "+kstring); + + //Throw an exception of there was an error in a thread + if(errorState){ + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + /** Spawn process threads */ + private void spawnThreads0(){ + + //Create a read input stream + final ConcurrentReadInputStream cris; + { + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); + cris.start(); //Start the stream + if(verbose){outstream.println("Started cris");} + } + paired=cris.paired(); + if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} + + //Determine how many threads may be used + final int threads=Tools.min(10, Shared.threads()); + + //Fill a list with ProcessThreads + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){ + alpt.add(new ProcessThread(cris, null, i, heapSize)); + } + + //Start the threads + for(ProcessThread pt : alpt){ + pt.start(); + } + + //Wait for completion of all threads + boolean success=true; + for(ProcessThread pt : alpt){ + + //Wait until this thread has terminated + while(pt.getState()!=Thread.State.TERMINATED){ + try { + //Attempt a join operation + pt.join(); + } catch (InterruptedException e) { + //Potentially handle this, if it is expected to occur + e.printStackTrace(); + } + } + + //Accumulate per-thread statistics + readsProcessed+=pt.readsProcessedT; + basesProcessed+=pt.basesProcessedT; + readsOut+=pt.readsOutT; + basesOut+=pt.basesOutT; + success&=pt.success; + } + + //Track whether any threads failed + if(!success){errorState=true;} + + //Do anything necessary after processing + + //Close the read streams + errorState|=ReadWrite.closeStreams(cris); + + } + + /** Spawn process threads */ + private void spawnThreads2(double rate){ + + //Create a read input stream + final ConcurrentReadInputStream cris; + { + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); + cris.setSampleRate((float)rate, seed); + cris.start(); //Start the stream + if(verbose){outstream.println("Started cris");} + } +// paired=cris.paired(); +// if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} + + //Optionally create a read output stream + final ConcurrentReadOutputStream ros; + if(ffout1!=null){ + //Select output buffer size based on whether it needs to be ordered + final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); + + //Notify user of output mode + if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){ + outstream.println("Writing interleaved."); + } + + ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false); + ros.start(); //Start the stream + }else{ros=null;} + + //Determine how many threads may be used + final int threads=Tools.min(10, Shared.threads()); + + sharedHeap.clear(); +// readsProcessed=0; +// basesProcessed=0; + readsOut=0; + basesOut=0; + + //Fill a list with ProcessThreads + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){ + alpt.add(new ProcessThread(cris, ros, i, heapSize)); + } + + //Start the threads + for(ProcessThread pt : alpt){ + pt.start(); + } + + //Wait for completion of all threads + boolean success=true; + for(ProcessThread pt : alpt){ + + //Wait until this thread has terminated + while(pt.getState()!=Thread.State.TERMINATED){ + try { + //Attempt a join operation + pt.join(); + } catch (InterruptedException e) { + //Potentially handle this, if it is expected to occur + e.printStackTrace(); + } + } + + //Accumulate per-thread statistics +// readsProcessed+=pt.readsProcessedT; +// basesProcessed+=pt.basesProcessedT; + readsOut+=pt.readsOutT; + basesOut+=pt.basesOutT; + success&=pt.success; + } + + //Track whether any threads failed + if(!success){errorState=true;} + + //Do anything necessary after processing + + //Write anything that was accumulated by ReadStats + errorState|=ReadStats.writeAll(); + //Close the read streams + errorState|=ReadWrite.closeStreams(cris, ros); + + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public static Sketch capLengthAtCountSum(Sketch sketch0, int max) { + int len=0; + long sum=0; + for(; len<sketch0.keyCounts.length; len++){ + sum=sum+sketch0.keyCounts[len]; + if(sum>max){break;} + } + if(len>=sketch0.length()){return sketch0;} + + long[] keys=Arrays.copyOf(sketch0.keys, len); + int[] counts=Arrays.copyOf(sketch0.keyCounts, len); + +// long[] array_, int[] counts_, int taxID_, long imgID_, long gSizeBases_, long gSizeKmers_, long gSequences_, double probCorrect_, +// String taxName_, String name0_, String fname_, ArrayList<String> meta_ + + Sketch sk=new Sketch(keys, counts, null, null, null, -1, -1, + sketch0.genomeSizeBases, sketch0.genomeSizeKmers, sketch0.genomeSequences, sketch0.probCorrect, + null, null, null, null); + + return sk; + } + + public static long calcTargetReads(Sketch sketch, long targetKmers, int minCount, int trials, long seed){ + final int[] counts0=sketch.keyCounts; + final int[] counts=Arrays.copyOf(counts0, counts0.length); + final long size=sketch.genomeSizeEstimate(minCount); + final long reads=sketch.genomeSequences; + final double targetKmerFraction=targetKmers/(double)size; + if(targetKmerFraction>=1){return reads;} + + final int targetKeys=(int)(targetKmerFraction*counts.length); + final long countSum=Tools.sum(counts0); + assert(countSum<Shared.MAX_ARRAY_LEN) : countSum; +// System.err.println("countsum: "+countSum); + + final IntMap map=new IntMap(0, counts0.length); + final int[] expanded=new int[(int)countSum]; + + long roundSum=0; + final Random randy=Shared.threadLocalRandom(seed); + for(int i=0; i<trials; i++){ + Tools.fill(counts, counts0); +// long rounds=reduceRounds(counts0, counts, minCount, targetKeys, randy); + long rounds=reduceRoundsIM(counts0, expanded, minCount, targetKeys, randy, map); + roundSum+=rounds; + } + double avgRounds=roundSum/(double)trials; +// System.err.println("avgRounds: "+avgRounds); + double targetCountFraction=1-(avgRounds/countSum); +// System.err.println("targetFraction: "+targetCountFraction); + return (long)(targetCountFraction*reads); + } + +// public static int reduceRoundsOld(final int[] counts, final int minCount, final int targetKeys, final Random randy){ +// assert(minCount>=0) : minCount; +// int rounds=0; +// int valid=0; +// for(int x : counts){ +// if(x>=minCount){valid++;} +// } +// +// int len=counts.length; +// System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)+", "+Arrays.toString(counts)); +// for(; valid>targetKeys; rounds++){ +// int pos=randy.nextInt(len); +//// assert(counts[pos]>0) : pos+"/"+len+": "+targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Arrays.toString(counts); +// if(counts[pos]==minCount){valid--;} +// counts[pos]--; +// if(counts[pos]==0){ +// len--;//shrink the array +// System.err.println("len="+len+", counts[len]="+counts[len]); +// System.err.println("pos="+pos+", counts[pos]="+counts[pos]); +// counts[pos]=counts[len];//move the last element to the empty slot +// counts[len]=0; +// if(pos!=len && len>0){ +// assert(counts[pos]>0) : pos+"/"+len+": "+targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Arrays.toString(counts); +// } +// } +// System.err.println(len+", "+pos+": "+Arrays.toString(counts)); +// } +// +// System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)); +// +// return rounds; +// } + + //This can be done faster with bins. + //Each bin contains all kmers with count x. When a bin is hit, one kmer moves to the next bin lower. + //Alternately, expand the array into one physical kmer per count. Store the current counts in an IntMap. Remove key each time. + public static long reduceRounds(final int[] counts0, final int[] counts, final int minCount, final int targetKeys, final Random randy){ + assert(minCount>=0) : minCount; + long rounds=0; + int valid=0; + for(int x : counts){ + if(x>=minCount){valid++;} + } + + int len=counts.length; + final long sum0=Tools.sum(counts); + long sum=sum0; +// System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)+", "+Arrays.toString(counts)); + for(; valid>targetKeys; rounds++){ + long posNum=(Long.MAX_VALUE&randy.nextLong())%sum; + long sum2=0; + int pos=0; + + for(int i=0; i<counts.length; i++){ + int x=counts[i]; + if(x>0){ + sum2+=x; + if(sum2>=posNum){ + pos=i; + break; + } + } + } + +// for(int i=0; i<counts0.length; i++){ +// int x=counts0[i]; +// if(x>0){ +// sum2+=x; +// if(sum2>=posNum){ +// pos=i; +// break; +// } +// } +// } + + sum--; + + assert(counts[pos]>0) : pos+"/"+len+": "+targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Arrays.toString(counts); + if(counts[pos]==minCount){valid--;} + counts[pos]--; + if(counts[pos]==0){ + len--;//shrink the array + } +// System.err.println(len+", "+pos+": "+Arrays.toString(counts)); + } + +// System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)); + + return rounds; + } + + //This can be done faster with bins. + //Each bin contains all kmers with count x. When a bin is hit, one kmer moves to the next bin lower. + //Alternately, expand the array into one physical kmer per count. Store the current counts in an IntMap. Remove key each time. + public static long reduceRoundsIM(final int[] counts0, final int[] expanded, final int minCount, final int targetKeys, final Random randy, final IntMap map){ + assert(minCount>=0) : minCount; + long rounds=0; + int valid=0; + map.clear(); + for(int i=0, k=0; i<counts0.length; i++){ + int x=counts0[i]; +// counts[i]=counts0[i]; + if(x>=minCount){valid++;} + map.put(i, x); + for(int j=0; j<x; j++, k++){ + expanded[k]=i; + } + } + assert(expanded.length==Tools.sum(counts0)); + + int len=expanded.length; +// System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)+", "+Arrays.toString(counts)); + for(; valid>targetKeys; rounds++){ + final int pos=randy.nextInt(len); + final int key=expanded[pos]; + final int x=map.get(key); + assert(x>0); + + + if(x==minCount){valid--;} + map.put(key, x-1); + + len--;//shrink the array + // System.err.println("len="+len+", counts[len]="+counts[len]); + // System.err.println("pos="+pos+", counts[pos]="+counts[pos]); + expanded[pos]=expanded[len];//move the last element to the empty slot + expanded[len]=0; + +// System.err.println(len+", "+pos+": "+Arrays.toString(counts)); + } + +// System.err.println(targetKeys+", "+counts.length+", "+valid+", "+len+", "+rounds+", "+Tools.sum(counts)); + + return rounds; + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Classes ----------------*/ + /*--------------------------------------------------------------*/ + + /** This class is static to prevent accidental writing to shared variables. + * It is safe to remove the static modifier. */ + private class ProcessThread extends Thread { + + //Constructor + ProcessThread(final ConcurrentReadInputStream cris_, final ConcurrentReadOutputStream ros_, final int tid_, final int size){ + cris=cris_; + ros=ros_; + tid=tid_; + localHeap=new SketchHeap(size, 0, true); + } + + //Called by start() + @Override + public void run(){ + //Do anything necessary prior to processing + + //Process the reads + processInner(); + + //Do anything necessary after processing + dumpHeap(); + + //Indicate successful exit status + success=true; + } + + /** Iterate through the reads */ + void processInner(){ + + //Grab the first ListNum of reads + ListNum<Read> ln=cris.nextList(); + //Grab the actual read list from the ListNum + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + //Check to ensure pairing is as expected + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); +// assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access + } + + //As long as there is a nonempty read list... + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning +// if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access + + //Loop through each read in the list + for(int idx=0; idx<reads.size(); idx++){ + final Read r1=reads.get(idx); + final Read r2=r1.mate; + + //Validate reads in worker threads + if(!r1.validated()){r1.validate(true);} + if(r2!=null && !r2.validated()){r2.validate(true);} + + //Track the initial length for statistics + final int initialLength1=r1.length(); + final int initialLength2=r1.mateLength(); + + //Increment counters + readsProcessedT+=r1.pairCount(); + basesProcessedT+=initialLength1+initialLength2; + + //Reads are processed in this block. + processReadPair(r1, r2); + } + + if(ros!=null){ + for(Read r1 : reads){ + readsOutT+=r1.pairCount(); + basesOutT+=r1.pairLength(); + } + + //Output reads to the output stream + if(ros!=null){ros.add(reads, ln.id);} + } + + //Notify the input stream that the list was used + cris.returnList(ln); +// if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access + + //Fetch a new list + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + //Notify the input stream that the final list was used + if(ln!=null){ + if(ln.list!=null){ln.list.clear();} + cris.returnList(ln.id, true); + } + } + + /** + * Process a read or a read pair. + * @param r1 Read 1 + * @param r2 Read 2 (may be null) + */ + void processReadPair(final Read r1, final Read r2){ + processReadNucleotide(r1); + if(r2!=null){processReadNucleotide(r2);} + } + + void processReadNucleotide(final Read r){ + final byte[] bases=r.bases; + final byte[] quals=r.quality; + long kmer=0; + long rkmer=0; + int len=0; + assert(!r.aminoacid()); + + final long min=minHashValue; + localHeap.genomeSizeBases+=r.length(); + localHeap.genomeSequences++; + + if(quals==null || (minProb<=0 && minQual<2)){ + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + long x=AminoAcid.baseToNumber[b]; + long x2=AminoAcid.baseToComplementNumber[b]; + + kmer=((kmer<<2)|x)&mask; + rkmer=((rkmer>>>2)|(x2<<shift2))&mask; + + if(x<0){len=0; rkmer=0;}else{len++;} + if(len>=k){ + localHeap.genomeSizeKmers++; + final long hashcode=hash(kmer, rkmer); + if(hashcode>min){localHeap.add(hashcode);} + } + } + }else{ + float prob=1; + for(int i=0; i<bases.length; i++){ + final byte b=bases[i]; + final long x=AminoAcid.baseToNumber[b]; + final long x2=AminoAcid.baseToComplementNumber[b]; + + {//Quality-related stuff + final byte q=quals[i]; + assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual; + prob=prob*align2.QualityTools.PROB_CORRECT[q]; + if(len>k){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + if(x<0 || q<minQual){ + len=0; + kmer=rkmer=0; + prob=1; + }else{ + len++; + } + } + + kmer=((kmer<<2)|x)&mask; + rkmer=((rkmer>>>2)|(x2<<shift2))&mask; + + if(len>=k && prob>=minProb){ + localHeap.genomeSizeKmers++; + localHeap.probSum+=prob; + final long hashcode=hash(kmer, rkmer); + if(hashcode>min){localHeap.checkAndAdd(hashcode);} + } + } + } + } + + private void dumpHeap(){ + synchronized(sharedHeap){ + sharedHeap.add(localHeap); + } + } + + /** Number of reads processed by this thread */ + protected long readsProcessedT=0; + /** Number of bases processed by this thread */ + protected long basesProcessedT=0; + + /** Number of reads retained by this thread */ + protected long readsOutT=0; + /** Number of bases retained by this thread */ + protected long basesOutT=0; + + /** True only if this thread has completed successfully */ + boolean success=false; + + /** Shared input stream */ + private final ConcurrentReadInputStream cris; + /** Shared output stream */ + private final ConcurrentReadOutputStream ros; + /** Thread ID */ + final int tid; + + final SketchHeap localHeap; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Primary input file path */ + private String in1=null; + /** Secondary input file path */ + private String in2=null; + + private String qfin1=null; + private String qfin2=null; + + /** Primary output file path */ + private String out1=null; + /** Secondary output file path */ + private String out2=null; + + private String qfout1=null; + private String qfout2=null; + + /** Override input file extension */ + private String extin=null; + /** Override output file extension */ + private String extout=null; + + /*--------------------------------------------------------------*/ + + /** Number of reads processed */ + protected long readsProcessed=0; + /** Number of bases processed */ + protected long basesProcessed=0; + + /** Number of reads retained */ + protected long readsOut=0; + /** Number of bases retained */ + protected long basesOut=0; + + /** Quit after processing this many input reads; -1 means no limit */ + private long maxReads=-1; + + private boolean paired=false; + private int trials=25; + private long seed=-1; + private int maxExpandedLength=50000000; + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Primary input file */ + private final FileFormat ffin1; + /** Secondary input file */ + private final FileFormat ffin2; + + /** Primary output file */ + private final FileFormat ffout1; + /** Secondary output file */ + private final FileFormat ffout2; + + private final SketchHeap sharedHeap; + private final int heapSize; + private final long targetKmers; + private final int minCount; + + final int shift; + final int shift2; + final long mask; + + final float minProb; + final byte minQual; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Print status messages to this output stream */ + private PrintStream outstream=System.err; + /** Print verbose messages */ + public static boolean verbose=false; + /** True if an error was encountered */ + public boolean errorState=false; + /** Overwrite existing output files */ + private boolean overwrite=true; + /** Append to existing output files */ + private boolean append=false; + /** Reads are output in input order (not enabled) */ + private boolean ordered=true; + +}