diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/AnalyzeGenes.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/AnalyzeGenes.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,405 @@
+package prok;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import shared.Parse;
+import shared.Parser;
+import shared.PreParser;
+import shared.Shared;
+import shared.Timer;
+import shared.Tools;
+import structures.ByteBuilder;
+import structures.IntList;
+
+/**
+ * This class is designed to analyze paired prokaryotic fna and gff files
+ * to calculate the patterns in coding and noncoding frames, start and stop sites.
+ * It outputs a pgm file.
+ * @author Brian Bushnell
+ * @date Sep 27, 2018
+ *
+ */
+public class AnalyzeGenes {
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Command-line entry point.
+	 * Constructs an AnalyzeGenes instance from the arguments, runs it,
+	 * and finally closes the instance's output stream.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		//The timer is created before the instance so that construction is timed too.
+		final Timer timer=new Timer();
+		
+		final AnalyzeGenes analyzer=new AnalyzeGenes(args);
+		analyzer.process(timer);
+		
+		//Close the print stream if it was redirected
+		Shared.closeStream(analyzer.outstream);
+	}
+	
+	/**
+	 * Constructor.
+	 * Pre-parses and parses the arguments, optionally loads consensus sequences,
+	 * validates the input and output files, and fixes the worker thread count
+	 * and the output file format.
+	 * @param args Command line arguments
+	 */
+	public AnalyzeGenes(String[] args){
+		
+		//Preparse for help, config files, and outstream
+		final PreParser preParser=new PreParser(args, null/*getClass()*/, false);
+		args=preParser.args;
+		outstream=preParser.outstream;
+		
+		//Set shared static variables prior to parsing
+		ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.USE_PIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		
+		//Parse the arguments and copy the generic settings into this instance
+		final Parser parser=parse(args);
+		overwrite=parser.overwrite;
+		append=parser.append;
+		out=parser.out1;
+		
+		//Load sequences
+		if(alignRibo){ProkObject.loadConsensusSequenceFromFile(false, false);}
+		
+		fixExtensions(); //Add or remove .gz or .bz2 as needed
+		checkFileExistence(); //Ensure files can be read and written
+		checkStatics(); //Adjust file-related static fields as needed for this program
+		
+		//Thread count is the smallest of: the number of fna files, Shared.threads(),
+		//and max(32, half of the logical processors).
+		final int fileCount=fnaList.size();
+		final int sharedThreads=Shared.threads();
+		final int processorCap=Tools.max(32, Shared.CALC_LOGICAL_PROCESSORS()/2);
+		threads=Tools.min(fileCount, sharedThreads, processorCap);
+		
+		ffout=FileFormat.testOutput(out, FileFormat.PGM, null, true, overwrite, append, false);
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------    Initialization Helpers    ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Parse arguments from the command line.
+	 * Each argument is split on '=' into a lower-cased key and an optional value;
+	 * a value equal to "null" (any case) is treated as absent. Keys are offered in turn to
+	 * PGMTools.parseStatic, the flags handled here, ProkObject.parse, and the generic Parser.
+	 * A bare argument naming an existing fasta file is added to the fna list.
+	 * If no gff files were given, one gff path is derived from each fna path.
+	 * @param args Command line arguments (after preparsing)
+	 * @return The Parser holding the generic settings that were recognized
+	 */
+	private Parser parse(String[] args){
+		
+		Parser parser=new Parser();
+		parser.overwrite=overwrite;
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			//Locale.ROOT keeps key matching independent of the JVM's default locale;
+			//a locale-sensitive lowercase maps 'I' to a dotless i under Turkish locales,
+			//so keys such as "IN" would no longer equal "in".
+			String a=split[0].toLowerCase(Locale.ROOT);
+			String b=split.length>1 ? split[1] : null;
+			if(b!=null && b.equalsIgnoreCase("null")){b=null;}
+
+//			outstream.println(arg+", "+a+", "+b);
+			if(PGMTools.parseStatic(arg, a, b)){
+				//do nothing
+			}else if(a.equals("in") || a.equals("infna") || a.equals("fnain") || a.equals("fna") || a.equals("ref")){
+				assert(b!=null);
+				Tools.addFiles(b, fnaList);
+			}else if(a.equals("gff") || a.equals("ingff") || a.equals("gffin")){
+				assert(b!=null);
+				Tools.addFiles(b, gffList);
+			}else if(a.equals("verbose")){
+				verbose=Parse.parseBoolean(b);
+				ReadWrite.verbose=verbose;
+			}else if(a.equals("alignribo") || a.equals("align")){
+				alignRibo=Parse.parseBoolean(b);
+			}else if(a.equals("adjustendpoints")){
+				adjustEndpoints=Parse.parseBoolean(b);
+			}
+			
+			else if(ProkObject.parse(arg, a, b)){}
+			
+			else if(parser.parse(arg, a, b)){
+				//do nothing
+			}else if(arg.indexOf('=')<0 && new File(arg).exists() && FileFormat.isFastaFile(arg)){
+				//A bare path to an existing fasta file is taken as an fna input
+				fnaList.add(arg);
+			}else{
+				outstream.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+				//				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+
+		if(gffList.isEmpty()){
+			//No gff given: pair each fna with <prefix>.gff, or <prefix>.gff.gz if the former is missing.
+			//The prefix comes from ReadWrite.stripExtension (exact stripping rules not visible here).
+			for(String s : fnaList){
+				String prefix=ReadWrite.stripExtension(s);
+				String gff=prefix+".gff";
+				File f=new File(gff);
+				if(!f.exists()){
+					String gz=gff+".gz";
+					f=new File(gz);
+					assert(f.exists() && f.canRead()) : "Can't read file "+gff;
+					gff=gz;
+				}
+				gffList.add(gff);
+			}
+		}
+		//The two lists are consumed index-by-index, so their sizes must agree
+		assert(gffList.size()==fnaList.size()) : "Number of fna and gff files do not match: "+fnaList.size()+", "+gffList.size();
+		return parser;
+	}
+	
+	/** Add or remove .gz or .bz2 as needed */
+	private void fixExtensions(){
+		fnaList=Tools.fixExtension(fnaList);
+		gffList=Tools.fixExtension(gffList);
+		if(fnaList.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
+	}
+	
+	/** Ensure files can be read and written */
+	private void checkFileExistence(){
+		//Ensure output files can be written
+		if(!Tools.testOutputFiles(overwrite, append, false, out)){
+			outstream.println((out==null)+", "+out);
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n");
+		}
+		
+		//Ensure input files can be read
+		ArrayList<String> foo=new ArrayList<String>();
+		foo.addAll(fnaList);
+		foo.addAll(gffList);
+		if(!Tools.testInputFiles(false, true, foo.toArray(new String[0]))){
+			throw new RuntimeException("\nCan't read some input files.\n");  
+		}
+		
+		//Ensure that no file was specified multiple times
+		foo.add(out);
+		if(!Tools.testForDuplicateFiles(true, foo.toArray(new String[0]))){
+			throw new RuntimeException("\nSome file names were specified multiple times.\n");
+		}
+	}
+	
+	/** Adjust file-related static fields as needed for this program */
+	private static void checkStatics(){
+		//Adjust the number of threads for input file reading
+		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+			ByteFile.FORCE_MODE_BF2=true;
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Outer Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Build the gene model from all fna/gff pairs, write it to the output (if one was
+	 * specified), and print timing and per-type statistics to outstream.
+	 * @param t Timer supplied by the caller; it is stopped here before reporting
+	 * @throws RuntimeException If errorState is set once processing and writing are done
+	 */
+	void process(Timer t){
+		
+		//The threaded path needs at least 2 threads and at least 2 files
+		final boolean multithreaded=(Shared.threads()>=2 && fnaList.size()>=2);
+		final GeneModel pgm=(multithreaded ? spawnThreads() : makeModelST());
+		
+		final ByteStreamWriter bsw=ByteStreamWriter.makeBSW(ffout);
+		
+		//Serialize the model and track its size
+		final ByteBuilder bb=new ByteBuilder();
+		pgm.appendTo(bb);
+		bytesOut+=bb.length;
+		
+		//The writer is null-checked because it may be absent
+		if(bsw!=null){
+			bsw.addJob(bb);
+			final boolean writeError=bsw.poisonAndWait();
+			errorState|=writeError;
+		}
+		
+		t.stop();
+		
+		final String throughput=timeReadsBasesGenesProcessed(t, pgm.readsProcessed, pgm.basesProcessed, pgm.genesProcessed, pgm.filesProcessed, 8);
+		outstream.println(throughput);
+		
+		outstream.println();
+		final String typeCounts=typesProcessed(pgm, 12);
+		outstream.println(typeCounts);
+		
+		if(!errorState){return;}
+		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+	}
+	
+	private static String timeReadsBasesGenesProcessed(Timer t, long readsProcessed, long basesProcessed, long genesProcessed, long filesProcessed, int pad){
+		return ("Time:                         \t"+t+"\n"+readsBasesGenesProcessed(t.elapsed, readsProcessed, basesProcessed, genesProcessed, filesProcessed, pad));
+	}
+	
+	private static String readsBasesGenesProcessed(long elapsed, long reads, long bases, long genes, long files, int pad){
+		double rpnano=reads/(double)elapsed;
+		double bpnano=bases/(double)elapsed;
+		double gpnano=genes/(double)elapsed;
+		double fpnano=files/(double)elapsed;
+
+		String rstring=Tools.padKM(reads, pad);
+		String bstring=Tools.padKM(bases, pad);
+		String gstring=Tools.padKM(genes, pad);
+		String fstring=Tools.padKM(files, pad);
+		ByteBuilder sb=new ByteBuilder();
+		sb.append("Files Processed:    ").append(fstring).append(String.format(Locale.ROOT, " \t%.2f  files/sec", fpnano*1000000000)).append('\n');
+		sb.append("Sequences Processed:").append(rstring).append(String.format(Locale.ROOT, " \t%.2fk seqs/sec", rpnano*1000000)).append('\n');
+		sb.append("Genes Processed:    ").append(gstring).append(String.format(Locale.ROOT, " \t%.2fk genes/sec", gpnano*1000000)).append('\n');
+		sb.append("Bases Processed:    ").append(bstring).append(String.format(Locale.ROOT, " \t%.2fm bases/sec", bpnano*1000));
+		return sb.toString();
+	}
+	
+	private static String typesProcessed(GeneModel pgm, int pad){
+		
+		ByteBuilder sb=new ByteBuilder();
+		sb.append("CDS:   "+Tools.padLeft(pgm.statsCDS.lengthCount, pad)).nl();
+		sb.append("tRNA:  "+Tools.padLeft(pgm.statstRNA.lengthCount, pad)).nl();
+		sb.append("16S:   "+Tools.padLeft(pgm.stats16S.lengthCount, pad)).nl();
+		sb.append("23S:   "+Tools.padLeft(pgm.stats23S.lengthCount, pad)).nl();
+		sb.append("5S:    "+Tools.padLeft(pgm.stats5S.lengthCount, pad)).nl();
+		sb.append("18S:   "+Tools.padLeft(pgm.stats18S.lengthCount, pad));
+		return sb.toString();
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	//TODO: Process each file in a thread.
+	private GeneModel makeModelST(){
+		GeneModel pgmSum=new GeneModel(true);
+		
+		for(int i=0; i<fnaList.size(); i++){
+			String fna=fnaList.get(i);
+			String gff=gffList.get(i);
+			pgmSum.process(fna, gff);
+		}
+		return pgmSum;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------       Thread Management      ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Spawn process threads.
+	 * Creates {@code threads} FileThreads sharing one file-index counter, starts them all,
+	 * and returns the model merged from their results.
+	 * @return The combined GeneModel produced by waitForThreads
+	 */
+	private GeneModel spawnThreads(){
+		
+		//Shared counter from which each worker takes the next file index
+		final AtomicInteger nextFileIndex=new AtomicInteger(0);
+		
+		//Create every worker before starting any of them
+		final ArrayList<FileThread> workers=new ArrayList<FileThread>(threads);
+		for(int created=0; created<threads; created++){
+			final FileThread worker=new FileThread(nextFileIndex);
+			workers.add(worker);
+		}
+		
+		for(int i=0; i<workers.size(); i++){
+			workers.get(i).start();
+		}
+		
+		//Block until all workers are done and merge their models
+		return waitForThreads(workers);
+	}
+	
+	private GeneModel waitForThreads(ArrayList<FileThread> alpt){
+		
+		GeneModel pgm=new GeneModel(false);
+		
+		//Wait for completion of all threads
+		boolean success=true;
+		for(FileThread pt : alpt){
+			
+			//Wait until this thread has terminated
+			while(pt.getState()!=Thread.State.TERMINATED){
+				try {
+					//Attempt a join operation
+					pt.join();
+				} catch (InterruptedException e) {
+					//Potentially handle this, if it is expected to occur
+					e.printStackTrace();
+				}
+			}
+			
+			//Accumulate per-thread statistics
+			pgm.add(pt.pgm);
+			
+			success&=pt.success;
+			errorState|=pt.errorStateT;
+		}
+		
+		//Track whether any threads failed
+		if(!success){errorState=true;}
+		return pgm;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Classes        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Worker thread that repeatedly claims the next file index from a shared counter
+	 * and processes that fna/gff pair into its own GeneModel.
+	 */
+	private class FileThread extends Thread {
+		
+		/**
+		 * @param fnum_ Counter shared by all workers; each getAndIncrement yields the next file index
+		 */
+		FileThread(AtomicInteger fnum_){
+			fnum=fnum_;
+			pgm=new GeneModel(true);
+		}
+		
+		@Override
+		public void run(){
+			for(int i=fnum.getAndIncrement(); i<fnaList.size(); i=fnum.getAndIncrement()){
+				String fna=fnaList.get(i);
+				String gff=gffList.get(i);
+				//Accumulate instead of overwriting, so that an error reported for an earlier
+				//file is not erased when a later file is processed without error.
+				errorStateT|=pgm.process(fna, gff);
+//				System.err.println("Processed "+fna+" in "+this.toString());
+			}
+			//Only reached if no call above threw; waitForThreads treats false as a failure
+			success=true;
+		}
+		
+		/** Shared source of file indices */
+		private final AtomicInteger fnum;
+		/** Model holding the results of the files processed by this thread */
+		private final GeneModel pgm;
+		/** True if any process call in this thread returned true */
+		boolean errorStateT=false;
+		/** True once run() has completed normally */
+		boolean success=false;
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Input fna paths; entry i is processed together with entry i of gffList */
+	private ArrayList<String> fnaList=new ArrayList<String>();
+	/** Input gff paths; given explicitly or derived from the fna paths during parsing */
+	private ArrayList<String> gffList=new ArrayList<String>();
+	/** NOTE(review): not referenced anywhere else in the visible code */
+	private IntList taxList=new IntList();
+	/** Output path taken from the parser's out1; may be null */
+	private String out=null;
+	
+	/*--------------------------------------------------------------*/
+	
+	/** Length of the serialized model built in process() */
+	private long bytesOut=0;
+	/** When true, the constructor loads consensus sequences via ProkObject; set by alignribo/align */
+	static boolean alignRibo=true;
+	/** Set by the adjustendpoints flag; not read in this class (presumably used elsewhere - confirm) */
+	static boolean adjustEndpoints=true;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Final Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Output file format, created from out with type PGM */
+	private final FileFormat ffout;
+	/** Number of FileThreads created by spawnThreads() */
+	private final int threads;
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Common Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Destination for status messages; replaced by the PreParser's stream in the constructor */
+	private PrintStream outstream=System.err;
+	/** Verbose flag; also copied to ReadWrite.verbose when parsed */
+	public static boolean verbose=false;
+	/** Set when writing or a worker thread reports a problem; process() throws if it is true */
+	public boolean errorState=false;
+	/** Passed to the output file checks; overwritten from the parser */
+	private boolean overwrite=true;
+	/** Passed to the output file checks; overwritten from the parser */
+	private boolean append=false;
+	
+}
+