/*
 * view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/SplitRibo.java @ 68:5028fdace37b
 *
 * planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
 * author jpayne
 * date Tue, 18 Mar 2025 16:23:26 -0400
 * parents
 * children
 * line wrap: on
 * line source
 */
package prok;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;

import aligner.SingleStateAlignerFlat2;
import fileIO.ByteFile;
import fileIO.FileFormat;
import fileIO.ReadWrite;
import shared.Parse;
import shared.Parser;
import shared.PreParser;
import shared.ReadStats;
import shared.Shared;
import shared.Timer;
import shared.Tools;
import stream.ConcurrentReadInputStream;
import stream.ConcurrentReadOutputStream;
import stream.FastaReadInputStream;
import stream.Read;
import structures.ListNum;
import template.Accumulator;
import template.ThreadWaiter;

/**
 * Splits a mix of ribosomal sequences (such as Silva) into different files per type (16S, 18S, etc).
 * 
 * @author Brian Bushnell
 * @date November 19, 2015
 *
 */
public class SplitRibo implements Accumulator<SplitRibo.ProcessThread> {
	
	/*--------------------------------------------------------------*/
	/*----------------        Initialization        ----------------*/
	/*--------------------------------------------------------------*/
	
	/**
	 * Command-line entry point: builds a SplitRibo from the arguments, runs it,
	 * and then hands its status stream to Shared.closeStream.
	 * @param args Command line arguments
	 */
	public static void main(String[] args){
		//Timing begins before any parsing or loading happens
		final Timer timer=new Timer();
		
		//Construct the splitter from the arguments, then run it to completion
		final SplitRibo splitter=new SplitRibo(args);
		splitter.process(timer);
		
		//Close the print stream if it was redirected
		Shared.closeStream(splitter.outstream);
	}
	
	/**
	 * Constructor. Pre-parses and parses the arguments, validates them, creates the
	 * input FileFormat, and loads the consensus sequences for each sequence type.
	 * @param args Command line arguments
	 */
	public SplitRibo(String[] args){
		
		{//Preparse block for help, config files, and outstream
			final PreParser preParser=new PreParser(args, getClass(), false);
			args=preParser.args;
			outstream=preParser.outstream;
		}
		
		//Shared static settings must be in place before the arguments are parsed
		ReadWrite.USE_UNPIGZ=true;
		ReadWrite.USE_PIGZ=true;
		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
		Shared.capBufferLen(50);
		ReadWrite.ZIPLEVEL=9;
		
		{//Parse the arguments, then copy the results into fields
			final Parser parsed=parse(args);
			Parser.processQuality();
			
			maxReads=parsed.maxReads;
			
			//The overwrite and append settings are mirrored into ReadStats
			ReadStats.overwrite=parsed.overwrite;
			overwrite=ReadStats.overwrite;
			ReadStats.append=parsed.append;
			append=ReadStats.append;
			
			in1=parsed.in1;
			qfin1=parsed.qfin1;
			extin=parsed.extin;

			outPattern=parsed.out1;
			extout=parsed.extout;
		}

		validateParams(); //Require an input file
		fixExtensions(); //Add or remove .gz or .bz2 as needed
		checkFileExistence(); //Ensure files can be read and written
		checkStatics(); //Adjust file-related static fields as needed for this program 

		//Create the input FileFormat object
		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
		
		//sequenceTypes may have been replaced during parsing, so size the arrays only now
		numTypes=sequenceTypes.length;
		readsOut=new long[numTypes];
		basesOut=new long[numTypes];
		consensusSequences=loadConsensusSequenceFromFile();
	}
	
	/*--------------------------------------------------------------*/
	/*----------------    Initialization Helpers    ----------------*/
	/*--------------------------------------------------------------*/
	
	/**
	 * Parse arguments from the command line.
	 * Each argument has the form "key=value"; the key is lower-cased before matching.
	 * Flags specific to this class are handled here, and anything else is offered to
	 * the shared Parser. Unrecognized flags are printed to outstream and trip an assertion.
	 * @param args Command line arguments (after pre-parsing)
	 * @return The Parser that received the standard flags
	 */
	private Parser parse(String[] args){
		
		//Create a parser object
		final Parser parser=new Parser();
		
		//Set any necessary Parser defaults here
		//parser.foo=bar;
		
		//Parse each argument
		for(int i=0; i<args.length; i++){
			final String arg=args[i];
			
			//Split on the first '=' only, so a value that itself contains '=' is not truncated
			final String[] split=arg.split("=", 2);
			final String a=split[0].toLowerCase();
			String b=split.length>1 ? split[1] : null;
			//An empty value ("key=") and the literal "null" both mean no value
			if(b!=null && (b.isEmpty() || b.equalsIgnoreCase("null"))){b=null;}
			
			if(a.equals("verbose")){
				verbose=Parse.parseBoolean(b);
			}else if(a.equals("ordered")){
				ordered=Parse.parseBoolean(b);
			}else if(a.equals("minid")){
				minID=Float.parseFloat(b);
			}else if(a.equals("minid2") || a.equals("refineid")){
				refineID=Float.parseFloat(b);
			}else if(a.equals("out") || a.equals("pattern") || a.equals("outpattern")){
				parser.out1=b;
			}else if(a.equals("type") || a.equals("types")){
				parseTypes(b);
			}else if(a.equals("parse_flag_goes_here")){
				//Template placeholder; the value is parsed but not stored anywhere
				Parse.parseKMG(b);
			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
				//do nothing
			}else{
				outstream.println("Unknown parameter "+args[i]);
				assert(false) : "Unknown parameter "+args[i];
			}
		}
		
		return parser;
	}
	
	/**
	 * Replace the list of sequence types with a user-specified, comma-delimited list.
	 * Index 0 is always "Other". The remaining entries are the given names with casing
	 * normalized: a leading "its" becomes "ITS", and every remaining lowercase 's'
	 * becomes 'S' (so "16s" becomes "16S" and "m16s" becomes "m16S").
	 * @param b Comma-delimited type names, such as "16S,18S". A null value trips an
	 * assertion; with assertions disabled, only the "Other" type remains.
	 */
	private void parseTypes(String b){
		if(b==null){
			assert(false) : "'types' flag requires a list of types, such as 'types=16S,18S'";
			sequenceTypes=new String[] {"Other"};
			return;
		}
		
		final String[] split=b.split(",");
		final String[] types=new String[split.length+1];
		types[0]="Other";
		for(int i=0; i<split.length; i++){
			String s=split[i];
			//Fix the ITS prefix BEFORE uppercasing 's'; afterward "its" would already be "itS" and never match
			if(s.startsWith("its")){s="ITS"+s.substring(3);}
			types[i+1]=s.replace('s', 'S');
		}
		sequenceTypes=types;
	}
	
	/**
	 * Add or remove .gz or .bz2 as needed.
	 * Both input paths are passed through Tools.fixExtension and replaced by its result.
	 */
	private void fixExtensions(){
		this.in1=Tools.fixExtension(this.in1);
		this.qfin1=Tools.fixExtension(this.qfin1);
	}
	
	/**
	 * Ensure files can be read and written.
	 * Checks the input file, then, if an output pattern was given, checks the output
	 * file of every sequence type (the pattern with '#' replaced by the type name).
	 * @throws RuntimeException if the input cannot be read, the pattern lacks '#',
	 * an output cannot be written, or a file name is reported as duplicated
	 */
	private void checkFileExistence(){
		
		//The input must be readable
		final boolean inputOK=Tools.testInputFiles(false, true, in1);
		if(!inputOK){
			throw new RuntimeException("\nCan't read some input files.\n");
		}
		
		//Without an output pattern there is nothing further to check
		if(outPattern==null){return;}
		
		if(outPattern.indexOf('#')<0){
			throw new RuntimeException("OutPattern must contain '#' symbol: "+outPattern);
		}
		
		for(int i=0; i<sequenceTypes.length; i++){
			final String fname=outPattern.replaceFirst("#", sequenceTypes[i]);
			
			//Each per-type output must be writable
			final boolean writable=Tools.testOutputFiles(overwrite, append, false, fname);
			if(!writable){
				outstream.println((outPattern==null)+", "+(fname==null)+", "+outPattern+", "+fname);
				throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+fname+"\n");
			}

			//The output must not be reported as a duplicate of the input
			final boolean unique=Tools.testForDuplicateFiles(true, in1, fname);
			if(!unique){
				throw new RuntimeException("\nSome file names were specified multiple times.\n");
			}
		}
	}
	
	/**
	 * Adjust file-related static fields as needed for this program.
	 * If neither ByteFile mode has been forced and more than 2 threads are available,
	 * ByteFile.FORCE_MODE_BF2 is turned on.
	 */
	private static void checkStatics(){
		final boolean modeForced=(ByteFile.FORCE_MODE_BF1 || ByteFile.FORCE_MODE_BF2);
		if(!modeForced && Shared.threads()>2){
			ByteFile.FORCE_MODE_BF2=true;
		}
		
		assert(FastaReadInputStream.settingsOK());
	}
	
	/**
	 * Ensure required parameters are set.
	 * @return true when validation passes
	 * @throws RuntimeException if no input file was specified
	 */
	private boolean validateParams(){
		if(in1!=null){return true;}
		throw new RuntimeException("Error - at least one input file is required.");
	}
	
	/**
	 * Loads the consensus sequences of every type except index 0 ("Other"), and
	 * records where "m16S", "m18S", and "p16S" occur in sequenceTypes.
	 * @return An array indexed by type; element 0 is left null
	 */
	private final Read[][] loadConsensusSequenceFromFile(){
		m16S_index=Tools.find("m16S", sequenceTypes);
		m18S_index=Tools.find("m18S", sequenceTypes);
		p16S_index=Tools.find("p16S", sequenceTypes);
		final boolean hasM16S=(m16S_index>=0);
		final boolean hasM18S=(m18S_index>=0);
		final boolean hasP16S=(p16S_index>=0);
		
		final Read[][] loaded=new Read[numTypes][];
		for(int type=1; type<numTypes; type++){
			final String name=sequenceTypes[type];
			
			//Flags passed to the loader; presumably they strip the m/p variants from the 16S/18S sets - confirm in ProkObject
			final boolean stripM;
			final boolean stripP;
			if(name.equalsIgnoreCase("16S")){
				stripM=hasM16S;
				stripP=hasP16S;
			}else if(name.equalsIgnoreCase("18S")){
				stripM=hasM18S;
				stripP=false;
			}else{
				stripM=false;
				stripP=false;
			}
			loaded[type]=ProkObject.loadConsensusSequenceType(name, stripM, stripP);
		}
		return loaded;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------         Outer Methods        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Create read streams, process all data, and report the results.
	 * @param t Timer supplied by the caller; it is stopped here before statistics are printed
	 * @throws RuntimeException if an error state was recorded during processing
	 */
	void process(Timer t){
		
		//Remember the validation setting; input threads validate only when fewer than 4 threads are in use
		final boolean priorValidation=Read.VALIDATE_IN_CONSTRUCTOR;
		Read.VALIDATE_IN_CONSTRUCTOR=(Shared.threads()<4);
		
		//Open the input stream, then one output stream slot per type
		final ConcurrentReadInputStream cris=makeCris();
		final ConcurrentReadOutputStream[] outStreams=makeCrosArray();
		
		//Zero all counters
		basesProcessed=0;
		readsProcessed=0;
		Arrays.fill(basesOut, 0);
		Arrays.fill(readsOut, 0);
		
		//Process the reads in separate threads
		spawnThreads(cris, outStreams);
		
		if(verbose){outstream.println("Finished; closing streams.");}
		
		//Write anything that was accumulated by ReadStats, then close the read streams
		errorState|=ReadStats.writeAll();
		errorState|=ReadWrite.closeStreams(cris, outStreams);
		
		//Restore read validation
		Read.VALIDATE_IN_CONSTRUCTOR=priorValidation;

		//Everything not assigned to type 0 ("Other") counts as output
		final long typedReads=Tools.sum(readsOut)-readsOut[0];
		final long typedBases=Tools.sum(basesOut)-basesOut[0];
		
		//Report timing and results
		t.stop();
		outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
		outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, typedReads, typedBases, 8, true));

		//Per-type read counts
		outstream.println();
		outstream.println(Tools.string("Type", "Count", 8));
		int type=0;
		while(type<numTypes){
			outstream.println(Tools.number(sequenceTypes[type], readsOut[type], 8));
			type++;
		}
		
		//Throw an exception if there was an error in a thread
		if(errorState){
			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
		}
	}
	
	/**
	 * Create and start the read input stream for the primary input.
	 * @return The started input stream
	 */
	private ConcurrentReadInputStream makeCris(){
		final ConcurrentReadInputStream stream=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, qfin1, null);
		stream.start();
		if(verbose){outstream.println("Started cris");}
		return stream;
	}
	
	/**
	 * Create one output stream per sequence type, in the same order as sequenceTypes.
	 * @return Array of length numTypes; every entry is null when no output pattern was given
	 */
	private ConcurrentReadOutputStream[] makeCrosArray(){
		final ConcurrentReadOutputStream[] streams=new ConcurrentReadOutputStream[numTypes];
		int idx=0;
		while(idx<numTypes){
			streams[idx]=makeCros(sequenceTypes[idx]);
			idx++;
		}
		return streams;
	}
	
	/**
	 * Create and start the output stream for one sequence type.
	 * The file name is the output pattern with its first '#' replaced by the type name.
	 * @param type Name of the sequence type
	 * @return The started output stream, or null if no output pattern was given
	 */
	private ConcurrentReadOutputStream makeCros(String type){
		if(outPattern==null){return null;}

		//Buffer size depends on whether the output needs to be ordered
		final int buff;
		if(ordered){
			buff=Tools.mid(2, 16, (Shared.threads()*2)/3);
		}else{
			buff=4;
		}
		
		final FileFormat ff=FileFormat.testOutput(outPattern.replaceFirst("#", type), FileFormat.FASTA, extout, true, overwrite, append, ordered);
		final ConcurrentReadOutputStream stream=ConcurrentReadOutputStream.getStream(ff, null, buff, null, false);
		stream.start();
		return stream;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------       Thread Management      ----------------*/
	/*--------------------------------------------------------------*/
	
	/**
	 * Spawn process threads, wait for them to finish, and record any failure.
	 * One ProcessThread is created per available thread (Shared.threads()); results are
	 * gathered through this object's accumulate() method via ThreadWaiter.
	 * @param cris Shared input stream handed to every thread
	 * @param rosa Per-type output streams handed to every thread
	 */
	private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] rosa){
		
		//Determine how many threads may be used
		final int threads=Shared.threads();
		
		//Fill a list with ProcessThreads
		final ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
		for(int i=0; i<threads; i++){
			alpt.add(new ProcessThread(cris, rosa, i));
		}
		
		//Start the threads and wait for them to finish
		final boolean success=ThreadWaiter.startAndWait(alpt, this);
		
		//A failure must SET the error flag. The previous '&=' could only ever clear it,
		//so a failure reported by ThreadWaiter was silently dropped.
		errorState|=!success;
	}
	
	/**
	 * Fold one thread's counters into this object's totals, and flag an error
	 * if that thread did not finish successfully.
	 * @param pt The thread whose results are collected
	 */
	@Override
	public final void accumulate(ProcessThread pt){
		basesProcessed+=pt.basesProcessedT;
		readsProcessed+=pt.readsProcessedT;
		Tools.add(basesOut, pt.basesOutT);
		Tools.add(readsOut, pt.readsOutT);
		if(!pt.success){errorState=true;}
	}
	
	/** @return true if no error state has been recorded */
	@Override
	public final boolean success(){
		final boolean failed=errorState;
		return !failed;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------         Inner Methods        ----------------*/
	/*--------------------------------------------------------------*/
	
	/*--------------------------------------------------------------*/
	/*----------------         Inner Classes        ----------------*/
	/*--------------------------------------------------------------*/
	
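	/** Worker thread that classifies reads and routes them to the per-type output streams.
	 * This is a non-static inner class because it reads fields of the enclosing SplitRibo
	 * (such as numTypes and consensusSequences); avoid writing to shared fields from it. */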
	class ProcessThread extends Thread {
		
		/**
		 * Constructor.
		 * @param cris_ Shared input stream
		 * @param rosa_ Shared per-type output streams
		 * @param tid_ Thread ID
		 */
		ProcessThread(final ConcurrentReadInputStream cris_, final ConcurrentReadOutputStream[] rosa_, final int tid_){
			this.tid=tid_;
			this.rosa=rosa_;
			this.cris=cris_;
		}
		
		/** Thread body, called by start(): processes the reads, then marks this thread successful. */
		@Override
		public void run(){
			processInner();
			
			//Only reached when processInner returned normally
			success=true;
		}
		
		/**
		 * Iterate through the reads: fetch lists from the input stream until an
		 * empty or null list arrives, processing and returning each one.
		 */
		void processInner(){
			
			//Grab the first ListNum of reads
			ListNum<Read> ln=cris.nextList();

			//Input is expected to be unpaired
			if(ln!=null && !ln.isEmpty()){
				final Read first=ln.get(0);
				assert(first.mate==null);
			}

			//Process each nonempty list, return it to the input stream, then fetch the next
			for(; ln!=null && ln.size()>0; ln=cris.nextList()){
				processList(ln);
				cris.returnList(ln);
			}

			//Notify the input stream that the final list was used
			if(ln!=null){
				final boolean emptyList=(ln.list==null || ln.list.isEmpty());
				cris.returnList(ln.id, emptyList);
			}
		}
		
		/**
		 * Classify every read in a list, update this thread's counters, and hand
		 * each read to the output stream of its type.
		 * @param ln The list of reads to process; its id is passed along to the output streams
		 */
		void processList(ListNum<Read> ln){

			//Grab the actual read list from the ListNum
			final ArrayList<Read> reads=ln.list;
			
			//One output list per type
			@SuppressWarnings("unchecked")
			final ArrayList<Read>[] out=new ArrayList[numTypes];
			for(int i=0; i<numTypes; i++){
				out[i]=new ArrayList<Read>(50);
			}
			
			//Loop through each read in the list
			for(int idx=0; idx<reads.size(); idx++){
				final Read r1=reads.get(idx);
				
				//Validate reads in worker threads
				if(!r1.validated()){r1.validate(true);}

				//Track the initial length for statistics
				final int initialLength1=r1.length();
				final int initialLength2=r1.mateLength();

				//Increment counters
				readsProcessedT+=r1.pairCount();
				basesProcessedT+=initialLength1+initialLength2;
				
				//Classify the read and file it under its type
				final int type=processRead(r1);
				readsOutT[type]+=r1.pairCount();
				basesOutT[type]+=r1.pairLength();
				out[type].add(r1);
			}

			//Output reads to the output streams
			if(rosa!=null){
				for(int type=0; type<numTypes; type++){
					//Entries are null when no output pattern was given (makeCros returns null),
					//so skip them instead of failing with a NullPointerException
					if(rosa[type]!=null){
						rosa[type].add(out[type], ln.id);
					}
				}
			}
		}
		
		/**
		 * Process a read by aligning it to the consensus sequences of each type.
		 * The first pass aligns only to reference 0 of each type (the overall consensus).
		 * If nothing reached minID, the best identity is below refineID, or the best type
		 * is p16S, a second pass aligns to the remaining (clade-specific) references.
		 * The best identity is stored in r.obj.
		 * @param r The read to classify
		 * @return The best-matching type, or 0 for no matches.
		 */
		private int processRead(final Read r){
			int bestType=0;
			float bestID=-1;
			
			//First pass: align to only the overall consensus of each type
			for(int type=1; type<numTypes; type++){
				final float id=align(r, consensusSequences[type], 0, 1);
				if(id>bestID && id>=minID){
					bestType=type;
					bestID=id;
				}
			}
			
			//If nothing met minID, or the match is weak, or it matched chloro, align to clade-specific consensuses
			if(bestType<1 || bestID<refineID || bestType==p16S_index){
				for(int type=1; type<numTypes; type++){
					final Read[] refs=consensusSequences[type];
					//align() tolerates a missing reference set, so skip it here rather than fail on refs.length
					if(refs==null){continue;}
					final float id=align(r, refs, 1, refs.length);
					if(id>bestID && id>=minID){
						bestType=type;
						bestID=id;
					}
				}
			}
			
			r.obj=bestID;//If desired...  in actuality, more info might be useful, like alignment length
			return bestID<minID ? 0 : bestType;
		}
		
		/**
		 * Align a read to a range of reference sequences and return the best identity.
		 * @param r The query read
		 * @param refs Reference sequences; may be null
		 * @param minRef Index of the first reference to use (inclusive)
		 * @param maxRef Index after the last reference to use (exclusive); values beyond
		 * the array length are clamped to it
		 * @return The highest identity found, or -1 if no reference was aligned
		 */
		private float align(Read r, Read[] refs, int minRef, int maxRef){
			if(refs==null){return -1;}
			
			//Never index past the end of the array, e.g. when a type has no references at all
			final int limit=Math.min(maxRef, refs.length);
			float bestID=-1;
			for(int i=minRef; i<limit; i++){
				final float id=align(r.bases, refs[i].bases);
				bestID=Tools.max(id, bestID);
			}
			return bestID;
		}
		
		/**
		 * Align a query to the full length of a reference using this thread's aligner.
		 * @param query Query bases
		 * @param ref Reference bases
		 * @return The identity reported by the aligner's traceback, or 0 if fillUnlimited returned null
		 */
		private float align(byte[] query, byte[] ref){
			final int start=0;
			final int stop=ref.length-1;
			final int[] max=ssa.fillUnlimited(query, ref, start, stop, -9999);
			if(max==null){return 0;}
			
			//max holds {rows, maxCol, maxState}
			return ssa.tracebackIdentity(query, ref, start, stop, max[0], max[1], max[2], null);
		}
		
		/** Aligner used by align(); each ProcessThread creates its own instance */
		SingleStateAlignerFlat2 ssa=new SingleStateAlignerFlat2();

		/** Number of reads processed by this thread */
		protected long readsProcessedT=0;
		/** Number of bases processed by this thread */
		protected long basesProcessedT=0;
		
		/** Number of reads assigned to each type by this thread, indexed by type (0 is "Other") */
		protected long[] readsOutT=new long[numTypes];
		/** Number of bases assigned to each type by this thread, indexed by type (0 is "Other") */
		protected long[] basesOutT=new long[numTypes];
		
		/** True only if this thread has completed successfully */
		boolean success=false;
		
		/** Shared input stream */
		private final ConcurrentReadInputStream cris;
		/** Shared output streams, one per type; entries are null when no output pattern was given */
		private final ConcurrentReadOutputStream[] rosa;
		/** Thread ID */
		final int tid;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/

	/** Primary input file path */
	private String in1=null;
	
	/** Taken from parser.qfin1 and passed to the input stream; presumably a separate quality file for in1 - confirm against Parser */
	private String qfin1=null;

	/** Output file name pattern; must contain '#', which is replaced by each type name */
	private String outPattern=null;
	
	/** Override input file extension */
	private String extin=null;
	/** Override output file extension */
	private String extout=null;

	/** Minimum alignment identity for a read to be assigned to a type */
	float minID=0.59f; //This could be a per-type value
	/** Identity threshold below which a read is also aligned to the additional references of each type */
	float refineID=0.70f; //Refine alignment if best is less than this
	
	/** Position of "m16S" in sequenceTypes as returned by Tools.find; -2 until set */
	private int m16S_index=-2;
	/** Position of "m18S" in sequenceTypes as returned by Tools.find; -2 until set */
	private int m18S_index=-2;
	/** Position of "p16S" in sequenceTypes as returned by Tools.find; -2 until set */
	private int p16S_index=-2;
	
	/*--------------------------------------------------------------*/
	
	/** Number of reads processed */
	protected long readsProcessed=0;
	/** Number of bases processed */
	protected long basesProcessed=0;
	
	/** Quit after processing this many input reads; -1 means no limit */
	private long maxReads=-1;
	
	/** Names of the sequence types; index 0 is "Other", used for reads matching no type. May be replaced by the 'types' flag. */
	private String[] sequenceTypes=new String[] {"Other", "16S", "18S", "23S", "5S", "m16S", "m18S", "p16S"};
	/** Number of sequence types, including "Other"; set in the constructor after parsing */
	private final int numTypes;//=sequenceTypes.length;
	/** Consensus sequences indexed by type, then by reference number; element 0 is null */
	final Read[][] consensusSequences;
	
	/** Number of reads assigned to each type, indexed by type */
	final long[] readsOut;
	/** Number of bases assigned to each type, indexed by type */
	final long[] basesOut;
	
	/*--------------------------------------------------------------*/
	/*----------------         Final Fields         ----------------*/
	/*--------------------------------------------------------------*/

	/** Primary input file */
	private final FileFormat ffin1;
	
	/*--------------------------------------------------------------*/
	/*----------------         Static Fields        ----------------*/
	/*--------------------------------------------------------------*/
	
	/*--------------------------------------------------------------*/
	/*----------------        Common Fields         ----------------*/
	/*--------------------------------------------------------------*/
	
	/** Print status messages to this output stream */
	private PrintStream outstream=System.err;
	/** Print verbose messages */
	public static boolean verbose=false;
	/** True if an error was encountered */
	public boolean errorState=false;
	/** Overwrite existing output files */
	private boolean overwrite=false;
	/** Append to existing output files */
	private boolean append=false;
	/** Reads are output in input order */
	private boolean ordered=true;
	
}