diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/SplitRibo.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/SplitRibo.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,626 @@
+package prok;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import aligner.SingleStateAlignerFlat2;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import shared.Parse;
+import shared.Parser;
+import shared.PreParser;
+import shared.ReadStats;
+import shared.Shared;
+import shared.Timer;
+import shared.Tools;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import structures.ListNum;
+import template.Accumulator;
+import template.ThreadWaiter;
+
+/**
+ * Splits a mix of ribosomal sequences (such as Silva) into different files per type (16S, 18S, etc).
+ * 
+ * @author Brian Bushnell
+ * @date November 19, 2015
+ *
+ */
+public class SplitRibo implements Accumulator<SplitRibo.ProcessThread> {
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Code entrance from the command line.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		//Start a timer immediately upon code entrance.
+		Timer t=new Timer();
+		
+		//Create an instance of this class
+		SplitRibo x=new SplitRibo(args);
+		
+		//Run the object
+		x.process(t);
+		
+		//Close the print stream if it was redirected
+		Shared.closeStream(x.outstream);
+	}
+	
+	/**
+	 * Constructor.
+	 * @param args Command line arguments
+	 */
+	public SplitRibo(String[] args){
+		
+		{//Preparse block for help, config files, and outstream
+			PreParser pp=new PreParser(args, getClass(), false);
+			args=pp.args;
+			outstream=pp.outstream;
+		}
+		
+		//Set shared static variables prior to parsing
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		Shared.capBufferLen(50);
+		ReadWrite.ZIPLEVEL=9;
+		
+		{//Parse the arguments
+			final Parser parser=parse(args);
+			Parser.processQuality();
+			
+			maxReads=parser.maxReads;
+			overwrite=ReadStats.overwrite=parser.overwrite;
+			append=ReadStats.append=parser.append;
+			
+			in1=parser.in1;
+			qfin1=parser.qfin1;
+			extin=parser.extin;
+
+			outPattern=parser.out1;
+			extout=parser.extout;
+		}
+
+		validateParams();
+		fixExtensions(); //Add or remove .gz or .bz2 as needed
+		checkFileExistence(); //Ensure files can be read and written
+		checkStatics(); //Adjust file-related static fields as needed for this program 
+
+		//Create input FileFormat objects
+		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+		
+		numTypes=sequenceTypes.length;
+		readsOut=new long[numTypes];
+		basesOut=new long[numTypes];
+		consensusSequences=loadConsensusSequenceFromFile();
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------    Initialization Helpers    ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Parse arguments from the command line */
+	private Parser parse(String[] args){
+		
+		//Create a parser object
+		Parser parser=new Parser();
+		
+		//Set any necessary Parser defaults here
+		//parser.foo=bar;
+		
+		//Parse each argument
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			
+			//Break arguments into their constituent parts, in the form of "a=b"
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			if(b!=null && b.equalsIgnoreCase("null")){b=null;}
+			
+			if(a.equals("verbose")){
+				verbose=Parse.parseBoolean(b);
+			}else if(a.equals("ordered")){
+				ordered=Parse.parseBoolean(b);
+			}else if(a.equalsIgnoreCase("minid")){
+				minID=Float.parseFloat(b);
+			}else if(a.equalsIgnoreCase("minid2") || a.equalsIgnoreCase("refineid")){
+				refineID=Float.parseFloat(b);
+			}else if(a.equals("out") || a.equals("pattern") || a.equals("outpattern")){
+				parser.out1=b;
+			}else if(a.equals("type") || a.equals("types")){
+				parseTypes(b);
+			}else if(a.equals("parse_flag_goes_here")){
+				long fake_variable=Parse.parseKMG(b);
+				//Set a variable here
+			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+				//do nothing
+			}else{
+				outstream.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+			}
+		}
+		
+		return parser;
+	}
+	
+	private void parseTypes(String b){
+		sequenceTypes=null;
+		if(b==null){
+			assert(false) : "'types' flag requires a list of types, such as 'types=16S,18S'";
+			sequenceTypes=new String[] {"Other"};
+		}else{
+			String[] split=b.split(",");
+			sequenceTypes=new String[split.length+1];
+			sequenceTypes[0]="Other";
+			for(int i=0; i<split.length; i++){
+				String s=split[i].replace('s', 'S');
+				if(s.startsWith("its")){s=s.replaceFirst("its", "ITS");}
+				sequenceTypes[i+1]=s;
+			}
+		}
+	}
+	
+	/** Add or remove .gz or .bz2 as needed */
+	private void fixExtensions(){
+		in1=Tools.fixExtension(in1);
+		qfin1=Tools.fixExtension(qfin1);
+	}
+	
+	/** Ensure files can be read and written */
+	private void checkFileExistence(){
+		
+		//Ensure input files can be read
+		if(!Tools.testInputFiles(false, true, in1)){
+			throw new RuntimeException("\nCan't read some input files.\n");  
+		}
+		
+		if(outPattern==null){return;}
+		
+		if(!outPattern.contains("#")){
+			throw new RuntimeException("OutPattern must contain '#' symbol: "+outPattern);
+		}
+		
+		for(String type : sequenceTypes) {
+			String out=outPattern.replaceFirst("#", type);
+			
+			//Ensure output files can be written
+			if(!Tools.testOutputFiles(overwrite, append, false, out)){
+				outstream.println((outPattern==null)+", "+(out==null)+", "+outPattern+", "+out);
+				throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n");
+			}
+
+			//Ensure that no file was specified multiple times
+			if(!Tools.testForDuplicateFiles(true, in1, out)){
+				throw new RuntimeException("\nSome file names were specified multiple times.\n");
+			}
+		}
+	}
+	
+	/** Adjust file-related static fields as needed for this program */
+	private static void checkStatics(){
+		//Adjust the number of threads for input file reading
+		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+			ByteFile.FORCE_MODE_BF2=true;
+		}
+		
+		assert(FastaReadInputStream.settingsOK());
+	}
+	
+	/** Ensure parameter ranges are within bounds and required parameters are set */
+	private boolean validateParams(){
+//		assert(minfoo>0 && minfoo<=maxfoo) : minfoo+", "+maxfoo;
+		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
+		return true;
+	}
+	
+	private final Read[][] loadConsensusSequenceFromFile(){
+		Read[][] seqs=new Read[numTypes][];
+		m16S_index=Tools.find("m16S", sequenceTypes);
+		m18S_index=Tools.find("m18S", sequenceTypes);
+		p16S_index=Tools.find("p16S", sequenceTypes);
+		boolean stripM16S=(m16S_index>=0);
+		boolean stripM18S=(m18S_index>=0);
+		boolean stripP16S=(p16S_index>=0);
+		for(int st=1; st<numTypes; st++){
+			String name=sequenceTypes[st];
+			boolean is16S=name.equalsIgnoreCase("16S");
+			boolean is18S=name.equalsIgnoreCase("18S");
+			seqs[st]=ProkObject.loadConsensusSequenceType(name, ((is16S && stripM16S) || (is18S && stripM18S)), (is16S && stripP16S));
+		}
+		return seqs;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Outer Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Create read streams and process all data */
+	void process(Timer t){
+		
+		//Turn off read validation in the input threads to increase speed
+		final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+		Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
+		
+		//Create a read input stream
+		final ConcurrentReadInputStream cris=makeCris();
+		
+		//Optionally create a read output stream
+		final ConcurrentReadOutputStream[] rosa=makeCrosArray();
+		
+		//Reset counters
+		readsProcessed=0;
+		basesProcessed=0;
+		Arrays.fill(readsOut, 0);
+		Arrays.fill(basesOut, 0);
+		
+		//Process the reads in separate threads
+		spawnThreads(cris, rosa);
+		
+		if(verbose){outstream.println("Finished; closing streams.");}
+		
+		//Write anything that was accumulated by ReadStats
+		errorState|=ReadStats.writeAll();
+		//assert(!errorState);
+		//Close the read streams
+		errorState|=ReadWrite.closeStreams(cris, rosa);
+		//assert(!errorState);
+		
+		//Reset read validation
+		Read.VALIDATE_IN_CONSTRUCTOR=vic;
+
+		long readsOut2=Tools.sum(readsOut)-readsOut[0];
+		long basesOut2=Tools.sum(basesOut)-basesOut[0];
+		
+		//Report timing and results
+		t.stop();
+		outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
+		outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut2, basesOut2, 8, true));
+
+		outstream.println();
+		outstream.println(Tools.string("Type", "Count", 8));
+		for(int type=0; type<numTypes; type++){
+			outstream.println(Tools.number(sequenceTypes[type], readsOut[type], 8));
+		}
+		
+		//Throw an exception of there was an error in a thread
+		if(errorState){
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+	
+	private ConcurrentReadInputStream makeCris(){
+		ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, qfin1, null);
+		cris.start(); //Start the stream
+		if(verbose){outstream.println("Started cris");}
+		return cris;
+	}
+	
+	private ConcurrentReadOutputStream[] makeCrosArray(){
+		ConcurrentReadOutputStream[] rosa=new ConcurrentReadOutputStream[numTypes];
+		for(int i=0; i<numTypes; i++){
+			String type=sequenceTypes[i];
+			final ConcurrentReadOutputStream ros=makeCros(type);
+			rosa[i]=ros;
+		}
+		return rosa;
+	}
+	
+	private ConcurrentReadOutputStream makeCros(String type){
+		if(outPattern==null){return null;}
+
+		//Select output buffer size based on whether it needs to be ordered
+		final int buff=(ordered ? Tools.mid(2, 16, (Shared.threads()*2)/3) : 4);
+		final String fname=outPattern.replaceFirst("#", type);
+		FileFormat ff=FileFormat.testOutput(fname, FileFormat.FASTA, extout, true, overwrite, append, ordered);
+
+		final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff, null, buff, null, false);
+		ros.start(); //Start the stream
+		return ros;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------       Thread Management      ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Spawn process threads */
+	private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] rosa){
+		
+		//Do anything necessary prior to processing
+		
+		//Determine how many threads may be used
+		final int threads=Shared.threads();
+		
+		//Fill a list with ProcessThreads
+		ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
+		for(int i=0; i<threads; i++){
+			alpt.add(new ProcessThread(cris, rosa, i));
+		}
+		
+		//Start the threads and wait for them to finish
+		boolean success=ThreadWaiter.startAndWait(alpt, this);
+		errorState&=!success;
+		//assert(!errorState);
+		
+		//Do anything necessary after processing
+		
+	}
+	
+	@Override
+	public final void accumulate(ProcessThread pt){
+		readsProcessed+=pt.readsProcessedT;
+		basesProcessed+=pt.basesProcessedT;
+		Tools.add(readsOut, pt.readsOutT);
+		Tools.add(basesOut, pt.basesOutT);
+		errorState|=(!pt.success);
+		//assert(!errorState);
+	}
+	
+	@Override
+	public final boolean success(){return !errorState;}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Classes        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** This class is static to prevent accidental writing to shared variables.
+	 * It is safe to remove the static modifier. */
+	class ProcessThread extends Thread {
+		
+		//Constructor
+		ProcessThread(final ConcurrentReadInputStream cris_, final ConcurrentReadOutputStream[] rosa_, final int tid_){
+			cris=cris_;
+			rosa=rosa_;
+			tid=tid_;
+		}
+		
+		//Called by start()
+		@Override
+		public void run(){
+			//Do anything necessary prior to processing
+			
+			//Process the reads
+			processInner();
+			
+			//Do anything necessary after processing
+			
+			//Indicate successful exit status
+			success=true;
+		}
+		
+		/** Iterate through the reads */
+		void processInner(){
+			
+			//Grab the first ListNum of reads
+			ListNum<Read> ln=cris.nextList();
+
+			//Check to ensure pairing is as expected
+			if(ln!=null && !ln.isEmpty()){
+				Read r=ln.get(0);
+				assert(r.mate==null);
+			}
+
+			//As long as there is a nonempty read list...
+			while(ln!=null && ln.size()>0){
+//				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
+				
+				processList(ln);
+				
+				//Notify the input stream that the list was used
+				cris.returnList(ln);
+//				if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
+				
+				//Fetch a new list
+				ln=cris.nextList();
+			}
+
+			//Notify the input stream that the final list was used
+			if(ln!=null){
+				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+			}
+		}
+		
+		void processList(ListNum<Read> ln){
+
+			//Grab the actual read list from the ListNum
+			final ArrayList<Read> reads=ln.list;
+			
+			@SuppressWarnings("unchecked")
+			final ArrayList<Read>[] out=new ArrayList[numTypes];
+			for(int i=0; i<numTypes; i++){
+				ArrayList<Read> list=new ArrayList<Read>(50);
+				out[i]=list;
+			}
+			
+			//Loop through each read in the list
+			for(int idx=0; idx<reads.size(); idx++){
+				final Read r1=reads.get(idx);
+				
+				//Validate reads in worker threads
+				if(!r1.validated()){r1.validate(true);}
+
+				//Track the initial length for statistics
+				final int initialLength1=r1.length();
+				final int initialLength2=r1.mateLength();
+
+				//Increment counters
+				readsProcessedT+=r1.pairCount();
+				basesProcessedT+=initialLength1+initialLength2;
+				
+				{
+					//Reads are processed in this block.
+					final int type=processRead(r1);
+					readsOutT[type]+=r1.pairCount();
+					basesOutT[type]+=r1.pairLength();
+					out[type].add(r1);
+				}
+			}
+
+			//Output reads to the output stream
+			if(rosa!=null){
+				for(int type=0; type<numTypes; type++){
+					rosa[type].add(out[type], ln.id);
+				}
+			}
+		}
+		
+		/**
+		 * Process a read.
+		 * @param r1 Read 1
+		 * @return The best-matching type, or 0 for no matches.
+		 */
+		private int processRead(final Read r){
+			int bestType=0;
+			float bestID=-1;
+			for(int type=1; type<numTypes; type++){//Align to only the overall consensus
+				Read[] refs=consensusSequences[type];
+				float id=align(r, refs, 0, 1);
+				if(id>bestID && id>=minID){
+					bestType=type;
+					bestID=id;
+				}
+			}
+			if(bestType<1 || bestID<refineID || bestType==p16S_index){//If nothing met minID, or if it matched chloro, align to clade-specific consensuses
+				for(int type=1; type<numTypes; type++){
+					Read[] refs=consensusSequences[type];
+					float id=align(r, refs, 1, refs.length);
+					if(id>bestID && id>=minID){
+						bestType=type;
+						bestID=id;
+					}
+				}
+			}
+			r.obj=bestID;//If desired...  in actuality, more info might be useful, like alignment length
+			return bestID<minID ? 0 : bestType;
+		}
+		
+		private float align(Read r, Read[] refs, int minRef, int maxRef){
+			float bestID=-1;
+			if(refs!=null){
+				for(int i=minRef; i<maxRef; i++){
+					Read ref=refs[i];
+					float id=align(r.bases, ref.bases);
+					bestID=Tools.max(id,  bestID);
+				}
+			}
+			return bestID;
+		}
+		
+		private float align(byte[] query, byte[] ref){
+			int a=0, b=ref.length-1;
+			int[] max=ssa.fillUnlimited(query, ref, a, b, -9999);
+			if(max==null){return 0;}
+			
+			final int rows=max[0];
+			final int maxCol=max[1];
+			final int maxState=max[2];
+			final float id=ssa.tracebackIdentity(query, ref, a, b, rows, maxCol, maxState, null);
+			return id;
+		}
+		
+		SingleStateAlignerFlat2 ssa=new SingleStateAlignerFlat2();
+
+		/** Number of reads processed by this thread */
+		protected long readsProcessedT=0;
+		/** Number of bases processed by this thread */
+		protected long basesProcessedT=0;
+		
+		/** Number of reads retained by this thread */
+		protected long[] readsOutT=new long[numTypes];
+		/** Number of bases retained by this thread */
+		protected long[] basesOutT=new long[numTypes];
+		
+		/** True only if this thread has completed successfully */
+		boolean success=false;
+		
+		/** Shared input stream */
+		private final ConcurrentReadInputStream cris;
+		/** Shared output stream */
+		private final ConcurrentReadOutputStream[] rosa;
+		/** Thread ID */
+		final int tid;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Primary input file path */
+	private String in1=null;
+	
+	private String qfin1=null;
+
+	/** Primary output file path */
+	private String outPattern=null;
+	
+	/** Override input file extension */
+	private String extin=null;
+	/** Override output file extension */
+	private String extout=null;
+
+	float minID=0.59f; //This could be a per-type value
+	float refineID=0.70f; //Refine alignment if best is less than this
+	
+	private int m16S_index=-2;
+	private int m18S_index=-2;
+	private int p16S_index=-2;
+	
+	/*--------------------------------------------------------------*/
+	
+	/** Number of reads processed */
+	protected long readsProcessed=0;
+	/** Number of bases processed */
+	protected long basesProcessed=0;
+	
+	/** Quit after processing this many input reads; -1 means no limit */
+	private long maxReads=-1;
+	
+	private String[] sequenceTypes=new String[] {"Other", "16S", "18S", "23S", "5S", "m16S", "m18S", "p16S"};
+	private final int numTypes;//=sequenceTypes.length;
+	final Read[][] consensusSequences;
+	
+	/** Number of reads retained */
+	final long[] readsOut;
+	/** Number of bases retained */
+	final long[] basesOut;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Final Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Primary input file */
+	private final FileFormat ffin1;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Static Fields        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Common Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Print status messages to this output stream */
+	private PrintStream outstream=System.err;
+	/** Print verbose messages */
+	public static boolean verbose=false;
+	/** True if an error was encountered */
+	public boolean errorState=false;
+	/** Overwrite existing output files */
+	private boolean overwrite=false;
+	/** Append to existing output files */
+	private boolean append=false;
+	/** Reads are output in input order */
+	private boolean ordered=true;
+	
+}