Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerSplit.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerSplit.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,546 @@ +package clump; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; + +import bloom.KCountArray; +import fileIO.ByteFile; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import jgi.BBMerge; +import shared.KillSwitch; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.ReadStats; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import stream.ConcurrentReadInputStream; +import stream.ConcurrentReadOutputStream; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; +import structures.ListNum; +import structures.Quantizer; + +/** + * @author Brian Bushnell + * @date June 20, 2014 + * + */ +public class KmerSplit { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ; + final boolean oldFInt=FASTQ.FORCE_INTERLEAVED, oldTInt=FASTQ.TEST_INTERLEAVED; + final int zl=ReadWrite.ZIPLEVEL; + final float ztd=ReadWrite.ZIP_THREAD_MULT; + final int mzt=ReadWrite.MAX_ZIP_THREADS; + Timer t=new Timer(); + KmerSplit x=new KmerSplit(args); + ReadWrite.ZIPLEVEL=Tools.min(ReadWrite.ZIPLEVEL, maxZipLevel); + x.process(t); + ReadWrite.USE_PIGZ=pigz; + ReadWrite.USE_UNPIGZ=unpigz; + ReadWrite.ZIPLEVEL=zl; + ReadWrite.ZIP_THREAD_MULT=ztd; + ReadWrite.MAX_ZIP_THREADS=mzt; + FASTQ.FORCE_INTERLEAVED=oldFInt; + FASTQ.TEST_INTERLEAVED=oldTInt; + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public KmerSplit(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), false); + args=pp.args; + outstream=pp.outstream; + } + + ReadWrite.USE_PIGZ=false; + ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + + boolean setInterleaved=false; //Whether it was explicitly set. + Parser parser=new Parser(); + + for(int i=0; i<args.length; i++){ + String arg=args[i]; + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + + if(parser.parse(arg, a, b)){ + //do nothing + }else if(a.equals("verbose")){ + verbose=KmerComparator.verbose=Parse.parseBoolean(b); + }else if(a.equals("parse_flag_goes_here")){ + //Set a variable here + }else if(a.equals("k")){ + k=Integer.parseInt(b); + assert(k>0 && k<32); + }else if(a.equals("mincount") || a.equals("mincr")){ + minCount=Integer.parseInt(b); + }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){ + groups=Integer.parseInt(b); + }else if(a.equals("rename") || a.equals("addname")){ + //Do nothing +// addName=Parse.parseBoolean(b); + }else if(a.equals("shortname") || a.equals("shortnames")){ + if(b!=null && b.equals("shrink")){ + shrinkName=true; + }else{ + shrinkName=false; + shortName=Parse.parseBoolean(b); + } + }else if(a.equals("rcomp") || a.equals("reversecomplement")){ + //ignore rcomp=Parse.parseBoolean(b); + }else if(a.equals("condense") || a.equals("consensus") || a.equals("concensus")){//Note the last one is intentionally misspelled + //ignore + }else if(a.equals("correct") || a.equals("ecc")){ + //ignore + }else if(a.equals("passes")){ + int x=Integer.parseInt(b); +// if(x>1){outstream.println("Warning: KmerSplit does not support multiple passes.");} + } + + else if(a.equals("dedupe")){ + //ignore + }else if(a.equals("entryfilter")){ + //ignore + }else if(a.equals("markduplicates")){ + //ignore + }else if(a.equals("markall")){ + //ignore + }else if(a.equals("addcount") || a.equals("renamebycount")){ + //ignore + }else if(a.equals("optical") || a.equals("opticalonly")){ + //ignore + }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){ + //ignore + }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){ + //ignore + }else if(a.equals("scanlimit") || a.equals("scan")){ + //ignore + }else if(a.equals("removeallduplicates") || a.equals("allduplicates")){ + //ignore + }else if(a.equals("allowns")){ + //ignore + }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){ + //ignore + }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){ + //ignore + }else if(a.equals("printduplicates")){ + //ignore + }else if(a.equals("dupeidentity")){ + //ignore + }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){ + //ignore + } + + else if(a.equals("prefilter")){ + KmerReduce.prefilter=Parse.parseBoolean(b); + }else if(a.equals("ecco")){ + ecco=Parse.parseBoolean(b); + }else if(a.equals("seed")){ + KmerComparator.defaultSeed=Long.parseLong(b); + }else if(a.equals("hashes")){ + KmerComparator.setHashes(Integer.parseInt(b)); + }else if(a.equals("border")){ + KmerComparator.defaultBorder=Integer.parseInt(b); + }else if(a.equals("minprob")){ + KmerComparator.minProb=Float.parseFloat(b); + }else if(a.equals("unpair")){ + unpair=Parse.parseBoolean(b); + }else if(a.equals("repair")){ + //Do nothing + }else if(a.equals("namesort") || a.equals("sort")){ + //Do nothing + }else if(a.equals("fetchthreads")){ + //Do nothing + }else if(a.equals("reorder") || a.equals("reorderclumps")){ + //reorder=Parse.parseBoolean(b); + }else if(a.equals("reorderpaired") || a.equals("reorderclumpspaired")){ +// reorderpaired=Parse.parseBoolean(b); + } + + + else if(Clump.parseStatic(arg, a, b)){ + //Do nothing + } + + else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + {//Process parser fields + Parser.processQuality(); + + maxReads=parser.maxReads; + + overwrite=ReadStats.overwrite=parser.overwrite; + append=ReadStats.append=parser.append; + + setInterleaved=parser.setInterleaved; + + in1=parser.in1; + in2=parser.in2; + + out1=parser.out1; + + extin=parser.extin; + extout=parser.extout; + } + + if(groups>2){ReadWrite.USE_PIGZ=false;} + + if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ + in2=in1.replace("#", "2"); + in1=in1.replace("#", "1"); + } + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ + ByteFile.FORCE_MODE_BF2=true; + } + + if(!setInterleaved){ + assert(in1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\n"; + if(in2!=null){ //If there are 2 input streams. + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + } + + if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} + + if(out1!=null){ + assert(out1.contains("%")); + outArray=new String[groups]; + for(int i=0; i<groups; i++){ + outArray[i]=out1.replaceFirst("%", ""+i); + } + if(!Tools.testOutputFiles(overwrite, append, false, outArray)){ + outstream.println((out1==null)+", "+out1); + throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n"); + } + ffout=new FileFormat[groups]; + if(groups>1){ReadWrite.setZipThreadMult(Tools.min(0.5f, 2f/(groups+1)));} + for(int i=0; i<groups; i++){ + ffout[i]=FileFormat.testOutput(outArray[i], FileFormat.FASTQ, extout, groups<10, overwrite, append, false); + } + }else{ + outArray=null; + throw new RuntimeException("out is a required parameter."); + } + + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); + } + + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Count kmers */ + void preprocess(){ + if(minCount>1){ + table=ClumpTools.getTable(in1, in2, k, minCount); + } + } + + /** Create read streams and process all data */ + void process(Timer t){ + + preprocess(); + + final ConcurrentReadInputStream cris; + { + cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null); + cris.start(); + if(verbose){outstream.println("Started cris");} + } + boolean paired=cris.paired(); + if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} + if(cris.paired() && (in1==null || !in1.contains(".sam") && !unpair)){ + outstream.println("Writing interleaved."); + } + + final ConcurrentReadOutputStream ros[]=new ConcurrentReadOutputStream[groups]; + try { + for(int i=0; i<groups; i++){ + final int buff=8; + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name."; + + ros[i]=ConcurrentReadOutputStream.getStream(ffout[i], null, null, null, buff, null, false); + ros[i].start(); + } + } catch (OutOfMemoryError e) { + KillSwitch.memKill(e); + } + + readsProcessed=0; + basesProcessed=0; + + //Process the read stream + processInner(cris, ros); + + errorState|=ReadStats.writeAll(); + + t.stop(); + + outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); + + if(errorState){ + Clumpify.sharedErrorState=true; + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + /** Collect and sort the reads */ + void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros){ + if(verbose){outstream.println("Making comparator.");} + KmerComparator kc=new KmerComparator(k, false, false); + if(verbose){outstream.println("Seed: "+kc.seed);} + + if(verbose){outstream.println("Splitting reads.");} + splitReads(cris, ros, kc); + lastMemProcessed=memProcessed; + + if(verbose){outstream.println("Done!");} + } + + public void splitReads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros, final KmerComparator kc){ + Timer t=new Timer(); + if(verbose){t.start("Making hash threads.");} + final int threads=Shared.threads(); + ArrayList<HashThread> alht=new ArrayList<HashThread>(threads); + for(int i=0; i<threads; i++){alht.add(new HashThread(i, cris, ros, kc));} + + if(verbose){outstream.println("Starting threads.");} + for(HashThread ht : alht){ht.start();} + + + if(verbose){outstream.println("Waiting for threads.");} + /* Wait for threads to die */ + for(HashThread ht : alht){ + + /* Wait for a thread to die */ + while(ht.getState()!=Thread.State.TERMINATED){ + try { + ht.join(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + readsProcessed+=ht.readsProcessedT; + basesProcessed+=ht.basesProcessedT; + diskProcessed+=ht.diskProcessedT; + memProcessed+=ht.memProcessedT; + } + + if(verbose){outstream.println("Closing streams.");} + errorState=ReadWrite.closeStreams(cris, ros)|errorState; + if(verbose){t.stop("Split time: ");} + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /*--------------------------------------------------------------*/ + /*---------------- Inner Classes ----------------*/ + /*--------------------------------------------------------------*/ + + private class HashThread extends Thread{ + + HashThread(int id_, ConcurrentReadInputStream cris_, ConcurrentReadOutputStream[] ros_, KmerComparator kc_){ + id=id_; + cris=cris_; + ros=ros_; + kc=kc_; + } + + @Override + public void run(){ + + final boolean paired=cris.paired(); + ListNum<Read> ln=cris.nextList(); + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + ArrayList<Read>[] array=new ArrayList[groups]; + for(int i=0; i<groups; i++){ + array[i]=new ArrayList<Read>(buffer); + } + + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + + for(Read r : reads){ + if(!r.validated()){ + r.validate(true); + if(r.mate!=null){r.mate.validate(true);} + } + readsProcessedT+=1+r.mateCount(); + basesProcessedT+=r.length()+r.mateLength(); + diskProcessedT+=r.countFastqBytes()+r.countMateFastqBytes(); + memProcessedT+=r.countBytes()+r.countMateBytes()+ReadKey.overhead; + if(shrinkName){ + Clumpify.shrinkName(r); + Clumpify.shrinkName(r.mate); + }else if(shortName){ + Clumpify.shortName(r); + Clumpify.shortName(r.mate); + } + + if(quantizeQuality){ + Quantizer.quantize(r, r.mate); + } + } + + if(ecco){ + for(Read r : reads){ + if(r.mate!=null){BBMerge.findOverlapStrict(r, r.mate, true);} + } + } + + ArrayList<Read> hashList=reads; + if(paired && unpair){ + hashList=new ArrayList<Read>(reads.size()*2); + for(Read r1 : reads){ + Read r2=r1.mate; + hashList.add(r1); + hashList.add(r2); + r1.mate=null; + r2.mate=null; + } + } + + kc.hash(hashList, table, minCount, true); + for(Read r : hashList){ + long kmer=((ReadKey)r.obj).kmer; + long code=kc.hash(kmer); + int code2=(int)(code%groups); + assert(code2>=0 && code2<array.length) : code2+", "+groups+", "+array.length+", "+kmer+", "+r.obj+"\n"+r; + array[code2].add(r); + if(array[code2].size()>=buffer){ + ros[code2].add(array[code2], 0); + array[code2]=new ArrayList<Read>(buffer); + } + } + cris.returnList(ln); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + if(ln!=null){ + cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); + } + for(int i=0; i<groups; i++){ + if(!array[i].isEmpty()){ + ros[i].add(array[i], 0); + } + } + } + + final int id; + final ConcurrentReadInputStream cris; + final ConcurrentReadOutputStream[] ros; + final KmerComparator kc; + static final int buffer=200; + + protected long readsProcessedT=0; + protected long basesProcessedT=0; + protected long diskProcessedT=0; + protected long memProcessedT=0; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private int k=31; + int groups=16; + int minCount=0; + + KCountArray table=null; + + /*--------------------------------------------------------------*/ + /*---------------- I/O Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private String in1=null; + private String in2=null; + + private String out1=null; + private String[] outArray=null; + + private String extin=null; + private String extout=null; + + /*--------------------------------------------------------------*/ + + protected long readsProcessed=0; + protected long basesProcessed=0; + protected long diskProcessed=0; + protected long memProcessed=0; + + protected static long lastMemProcessed=0; + + private long maxReads=-1; +// private boolean addName=false; + boolean shortName=false; + boolean shrinkName=false; + boolean ecco=false; + boolean unpair=false; + + static int maxZipLevel=2; + + static boolean quantizeQuality=false; + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private final FileFormat ffin1; + private final FileFormat ffin2; + + private final FileFormat[] ffout; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + public static boolean verbose=false; + public boolean errorState=false; + private boolean overwrite=false; + private boolean append=false; + +}