diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerSplit.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerSplit.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,546 @@
+package clump;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+
+import bloom.KCountArray;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import jgi.BBMerge;
+import shared.KillSwitch;
+import shared.Parse;
+import shared.Parser;
+import shared.PreParser;
+import shared.ReadStats;
+import shared.Shared;
+import shared.Timer;
+import shared.Tools;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+import structures.ListNum;
+import structures.Quantizer;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class KmerSplit {
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Code entrance from the command line.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ;
+		final boolean oldFInt=FASTQ.FORCE_INTERLEAVED, oldTInt=FASTQ.TEST_INTERLEAVED;
+		final int zl=ReadWrite.ZIPLEVEL;
+		final float ztd=ReadWrite.ZIP_THREAD_MULT;
+		final int mzt=ReadWrite.MAX_ZIP_THREADS;
+		Timer t=new Timer();
+		KmerSplit x=new KmerSplit(args);
+		ReadWrite.ZIPLEVEL=Tools.min(ReadWrite.ZIPLEVEL, maxZipLevel);
+		x.process(t);
+		ReadWrite.USE_PIGZ=pigz;
+		ReadWrite.USE_UNPIGZ=unpigz;
+		ReadWrite.ZIPLEVEL=zl;
+		ReadWrite.ZIP_THREAD_MULT=ztd;
+		ReadWrite.MAX_ZIP_THREADS=mzt;
+		FASTQ.FORCE_INTERLEAVED=oldFInt;
+		FASTQ.TEST_INTERLEAVED=oldTInt;
+		
+		//Close the print stream if it was redirected
+		Shared.closeStream(x.outstream);
+	}
+	
+	/**
+	 * Constructor.
+	 * @param args Command line arguments
+	 */
+	public KmerSplit(String[] args){
+		
+		{//Preparse block for help, config files, and outstream
+			PreParser pp=new PreParser(args, getClass(), false);
+			args=pp.args;
+			outstream=pp.outstream;
+		}
+		
+		ReadWrite.USE_PIGZ=false;
+		ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+		boolean setInterleaved=false; //Whether it was explicitly set.
+		Parser parser=new Parser();
+		
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+
+			if(parser.parse(arg, a, b)){
+				//do nothing
+			}else if(a.equals("verbose")){
+				verbose=KmerComparator.verbose=Parse.parseBoolean(b);
+			}else if(a.equals("parse_flag_goes_here")){
+				//Set a variable here
+			}else if(a.equals("k")){
+				k=Integer.parseInt(b);
+				assert(k>0 && k<32);
+			}else if(a.equals("mincount") || a.equals("mincr")){
+				minCount=Integer.parseInt(b);
+			}else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
+				groups=Integer.parseInt(b);
+			}else if(a.equals("rename") || a.equals("addname")){
+				//Do nothing
+//				addName=Parse.parseBoolean(b);
+			}else if(a.equals("shortname") || a.equals("shortnames")){
+				if(b!=null && b.equals("shrink")){
+					shrinkName=true;
+				}else{
+					shrinkName=false;
+					shortName=Parse.parseBoolean(b);
+				}
+			}else if(a.equals("rcomp") || a.equals("reversecomplement")){
+				//ignore rcomp=Parse.parseBoolean(b);
+			}else if(a.equals("condense") || a.equals("consensus") || a.equals("concensus")){//Note the last one is intentionally misspelled
+				//ignore
+			}else if(a.equals("correct") || a.equals("ecc")){
+				//ignore
+			}else if(a.equals("passes")){
+				int x=Integer.parseInt(b);
+//				if(x>1){outstream.println("Warning: KmerSplit does not support multiple passes.");}
+			}
+			
+			else if(a.equals("dedupe")){
+				//ignore
+			}else if(a.equals("entryfilter")){
+				//ignore
+			}else if(a.equals("markduplicates")){
+				//ignore
+			}else if(a.equals("markall")){
+				//ignore
+			}else if(a.equals("addcount") || a.equals("renamebycount")){
+				//ignore
+			}else if(a.equals("optical") || a.equals("opticalonly")){
+				//ignore
+			}else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){
+				//ignore
+			}else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){
+				//ignore
+			}else if(a.equals("scanlimit") || a.equals("scan")){
+				//ignore
+			}else if(a.equals("removeallduplicates") || a.equals("allduplicates")){
+				//ignore
+			}else if(a.equals("allowns")){
+				//ignore
+			}else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){
+				//ignore
+			}else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){
+				//ignore
+			}else if(a.equals("printduplicates")){
+				//ignore
+			}else if(a.equals("dupeidentity")){
+				//ignore
+			}else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){
+				//ignore
+			}
+			
+			else if(a.equals("prefilter")){
+				KmerReduce.prefilter=Parse.parseBoolean(b);
+			}else if(a.equals("ecco")){
+				ecco=Parse.parseBoolean(b);
+			}else if(a.equals("seed")){
+				KmerComparator.defaultSeed=Long.parseLong(b);
+			}else if(a.equals("hashes")){
+				KmerComparator.setHashes(Integer.parseInt(b));
+			}else if(a.equals("border")){
+				KmerComparator.defaultBorder=Integer.parseInt(b);
+			}else if(a.equals("minprob")){
+				KmerComparator.minProb=Float.parseFloat(b);
+			}else if(a.equals("unpair")){
+				unpair=Parse.parseBoolean(b);
+			}else if(a.equals("repair")){
+				//Do nothing
+			}else if(a.equals("namesort") || a.equals("sort")){
+				//Do nothing
+			}else if(a.equals("fetchthreads")){
+				//Do nothing
+			}else if(a.equals("reorder") || a.equals("reorderclumps")){
+				//reorder=Parse.parseBoolean(b);
+			}else if(a.equals("reorderpaired") || a.equals("reorderclumpspaired")){
+//				reorderpaired=Parse.parseBoolean(b);
+			}
+			
+			
+			else if(Clump.parseStatic(arg, a, b)){
+				//Do nothing
+			}
+				
+			else{
+				outstream.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+				//				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+		
+		{//Process parser fields
+			Parser.processQuality();
+			
+			maxReads=parser.maxReads;
+			
+			overwrite=ReadStats.overwrite=parser.overwrite;
+			append=ReadStats.append=parser.append;
+
+			setInterleaved=parser.setInterleaved;
+			
+			in1=parser.in1;
+			in2=parser.in2;
+
+			out1=parser.out1;
+			
+			extin=parser.extin;
+			extout=parser.extout;
+		}
+		
+		if(groups>2){ReadWrite.USE_PIGZ=false;}
+		
+		if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+			in2=in1.replace("#", "2");
+			in1=in1.replace("#", "1");
+		}
+		if(in2!=null){
+			if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+		}
+		
+		assert(FastaReadInputStream.settingsOK());
+		
+		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
+		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+			ByteFile.FORCE_MODE_BF2=true;
+		}
+		
+		if(!setInterleaved){
+			assert(in1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\n";
+			if(in2!=null){ //If there are 2 input streams.
+				FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+				outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+			}
+		}
+
+		if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+		
+		if(out1!=null){
+			assert(out1.contains("%"));
+			outArray=new String[groups];
+			for(int i=0; i<groups; i++){
+				outArray[i]=out1.replaceFirst("%", ""+i);
+			}
+			if(!Tools.testOutputFiles(overwrite, append, false, outArray)){
+				outstream.println((out1==null)+", "+out1);
+				throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+			}
+			ffout=new FileFormat[groups];
+			if(groups>1){ReadWrite.setZipThreadMult(Tools.min(0.5f, 2f/(groups+1)));}
+			for(int i=0; i<groups; i++){
+				ffout[i]=FileFormat.testOutput(outArray[i], FileFormat.FASTQ, extout, groups<10, overwrite, append, false);
+			}
+		}else{
+			outArray=null;
+			throw new RuntimeException("out is a required parameter.");
+		}
+
+		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+		ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+	}
+	
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Outer Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Count kmers */
+	void preprocess(){
+		if(minCount>1){
+			table=ClumpTools.getTable(in1, in2, k, minCount);
+		}
+	}
+
+	/** Create read streams and process all data */
+	void process(Timer t){
+		
+		preprocess();
+		
+		final ConcurrentReadInputStream cris;
+		{
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+			cris.start();
+			if(verbose){outstream.println("Started cris");}
+		}
+		boolean paired=cris.paired();
+		if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+		if(cris.paired() && (in1==null || !in1.contains(".sam") && !unpair)){
+			outstream.println("Writing interleaved.");
+		}
+
+		final ConcurrentReadOutputStream ros[]=new ConcurrentReadOutputStream[groups];
+		try {
+			for(int i=0; i<groups; i++){
+				final int buff=8;
+
+				assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+				
+				ros[i]=ConcurrentReadOutputStream.getStream(ffout[i], null, null, null, buff, null, false);
+				ros[i].start();
+			}
+		} catch (OutOfMemoryError e) {
+			KillSwitch.memKill(e);
+		}
+		
+		readsProcessed=0;
+		basesProcessed=0;
+		
+		//Process the read stream
+		processInner(cris, ros);
+		
+		errorState|=ReadStats.writeAll();
+		
+		t.stop();
+		
+		outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
+		
+		if(errorState){
+			Clumpify.sharedErrorState=true;
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+	
+	/** Collect and sort the reads */
+	void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros){
+		if(verbose){outstream.println("Making comparator.");}
+		KmerComparator kc=new KmerComparator(k, false, false);
+		if(verbose){outstream.println("Seed: "+kc.seed);}
+		
+		if(verbose){outstream.println("Splitting reads.");}
+		splitReads(cris, ros, kc);
+		lastMemProcessed=memProcessed;
+		
+		if(verbose){outstream.println("Done!");}
+	}
+	
+	public void splitReads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros, final KmerComparator kc){
+		Timer t=new Timer();
+		if(verbose){t.start("Making hash threads.");}
+		final int threads=Shared.threads();
+		ArrayList<HashThread> alht=new ArrayList<HashThread>(threads);
+		for(int i=0; i<threads; i++){alht.add(new HashThread(i, cris, ros, kc));}
+		
+		if(verbose){outstream.println("Starting threads.");}
+		for(HashThread ht : alht){ht.start();}
+		
+		
+		if(verbose){outstream.println("Waiting for threads.");}
+		/* Wait for threads to die */
+		for(HashThread ht : alht){
+			
+			/* Wait for a thread to die */
+			while(ht.getState()!=Thread.State.TERMINATED){
+				try {
+					ht.join();
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+			}
+			readsProcessed+=ht.readsProcessedT;
+			basesProcessed+=ht.basesProcessedT;
+			diskProcessed+=ht.diskProcessedT;
+			memProcessed+=ht.memProcessedT;
+		}
+		
+		if(verbose){outstream.println("Closing streams.");}
+		errorState=ReadWrite.closeStreams(cris, ros)|errorState;
+		if(verbose){t.stop("Split time: ");}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Classes        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private class HashThread extends Thread{
+
+		HashThread(int id_, ConcurrentReadInputStream cris_, ConcurrentReadOutputStream[] ros_, KmerComparator kc_){
+			id=id_;
+			cris=cris_;
+			ros=ros_;
+			kc=kc_;
+		}
+
+		@Override
+		public void run(){
+
+			final boolean paired=cris.paired();
+			ListNum<Read> ln=cris.nextList();
+			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+			
+			ArrayList<Read>[] array=new ArrayList[groups];
+			for(int i=0; i<groups; i++){
+				array[i]=new ArrayList<Read>(buffer);
+			}
+			
+			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
+				
+				for(Read r : reads){
+					if(!r.validated()){
+						r.validate(true);
+						if(r.mate!=null){r.mate.validate(true);}
+					}
+					readsProcessedT+=1+r.mateCount();
+					basesProcessedT+=r.length()+r.mateLength();
+					diskProcessedT+=r.countFastqBytes()+r.countMateFastqBytes();
+					memProcessedT+=r.countBytes()+r.countMateBytes()+ReadKey.overhead;
+					if(shrinkName){
+						Clumpify.shrinkName(r);
+						Clumpify.shrinkName(r.mate);
+					}else if(shortName){
+						Clumpify.shortName(r);
+						Clumpify.shortName(r.mate);
+					}
+					
+					if(quantizeQuality){
+						Quantizer.quantize(r, r.mate);
+					}
+				}
+				
+				if(ecco){
+					for(Read r : reads){
+						if(r.mate!=null){BBMerge.findOverlapStrict(r, r.mate, true);}
+					}
+				}
+				
+				ArrayList<Read> hashList=reads;
+				if(paired && unpair){
+					hashList=new ArrayList<Read>(reads.size()*2);
+					for(Read r1 : reads){
+						Read r2=r1.mate;
+						hashList.add(r1);
+						hashList.add(r2);
+						r1.mate=null;
+						r2.mate=null;
+					}
+				}
+				
+				kc.hash(hashList, table, minCount, true);
+				for(Read r : hashList){
+					long kmer=((ReadKey)r.obj).kmer;
+					long code=kc.hash(kmer);
+					int code2=(int)(code%groups);
+					assert(code2>=0 && code2<array.length) : code2+", "+groups+", "+array.length+", "+kmer+", "+r.obj+"\n"+r;
+					array[code2].add(r);
+					if(array[code2].size()>=buffer){
+						ros[code2].add(array[code2], 0);
+						array[code2]=new ArrayList<Read>(buffer);
+					}
+				}
+				cris.returnList(ln);
+				ln=cris.nextList();
+				reads=(ln!=null ? ln.list : null);
+			}
+			if(ln!=null){
+				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+			}
+			for(int i=0; i<groups; i++){
+				if(!array[i].isEmpty()){
+					ros[i].add(array[i], 0);
+				}
+			}
+		}
+
+		final int id;
+		final ConcurrentReadInputStream cris;
+		final ConcurrentReadOutputStream[] ros;
+		final KmerComparator kc;
+		static final int buffer=200;
+		
+		protected long readsProcessedT=0;
+		protected long basesProcessedT=0;
+		protected long diskProcessedT=0;
+		protected long memProcessedT=0;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private int k=31;
+	int groups=16;
+	int minCount=0;
+	
+	KCountArray table=null;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------          I/O Fields          ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private String in1=null;
+	private String in2=null;
+
+	private String out1=null;
+	private String[] outArray=null;
+	
+	private String extin=null;
+	private String extout=null;
+	
+	/*--------------------------------------------------------------*/
+	
+	protected long readsProcessed=0;
+	protected long basesProcessed=0;
+	protected long diskProcessed=0;
+	protected long memProcessed=0;
+	
+	protected static long lastMemProcessed=0;
+	
+	private long maxReads=-1;
+//	private boolean addName=false;
+	boolean shortName=false;
+	boolean shrinkName=false;
+	boolean ecco=false;
+	boolean unpair=false;
+	
+	static int maxZipLevel=2;
+
+	static boolean quantizeQuality=false;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Final Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private final FileFormat ffin1;
+	private final FileFormat ffin2;
+	
+	private final FileFormat[] ffout;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Common Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	private PrintStream outstream=System.err;
+	public static boolean verbose=false;
+	public boolean errorState=false;
+	private boolean overwrite=false;
+	private boolean append=false;
+	
+}