diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/PrintTaxonomy.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/PrintTaxonomy.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,559 @@
+package tax;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+import shared.Parse;
+import shared.Parser;
+import shared.PreParser;
+import shared.ReadStats;
+import shared.Shared;
+import shared.Timer;
+import shared.Tools;
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+import structures.ByteBuilder;
+import structures.ListNum;
+
+/**
+ * Filters sequences according to their taxonomy,
+ * as determined by the sequence name.  Sequences should
+ * be labeled with a gi number or NCBI taxID.
+ * 
+ * @author Brian Bushnell
+ * @date November 23, 2015
+ *
+ */
+public class PrintTaxonomy {
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/**
+	 * Code entrance from the command line.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		Timer t=new Timer();
+		PrintTaxonomy x=new PrintTaxonomy(args);
+		x.process(t);
+		
+		//Close the print stream if it was redirected
+		Shared.closeStream(x.outstream);
+	}
+	
+	/**
+	 * Constructor.
+	 * @param args Command line arguments
+	 */
+	public PrintTaxonomy(String[] args){
+		
+		{//Preparse block for help, config files, and outstream
+			PreParser pp=new PreParser(args, getClass(), false);
+			args=pp.args;
+			outstream=pp.outstream;
+		}
+		
+		//Set shared static variables
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		
+		//Create a parser object
+		Parser parser=new Parser();
+		
+		int taxLevel=0, minLevel=0, maxLevel=TaxTree.LIFE;
+		
+		//Parse each argument
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			
+			//Break arguments into their constituent parts, in the form of "a=b"
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			
+			if(a.equals("out")){
+				out1=b;
+			}else if(a.equals("counts")){
+				countFile=b;
+			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+				//do nothing
+			}else if(a.equals("verbose")){
+				verbose=Parse.parseBoolean(b);
+			}else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
+				giTableFile=b;
+			}else if(a.equals("accession")){
+				accessionFile=b;
+			}else if(a.equals("tree") || a.equals("taxtree")){
+				taxTreeFile=b;
+			}else if(a.equals("level") || a.equals("lv") || a.equals("taxlevel") || a.equals("tl")){
+				taxLevel=TaxTree.parseLevel(b);
+			}else if(a.equals("minlevel")){
+				minLevel=TaxTree.parseLevel(b);
+			}else if(a.equals("maxlevel")){
+				maxLevel=TaxTree.parseLevel(b);
+			}else if(a.equals("printname")){
+				printName=Parse.parseBoolean(b);
+			}else if(a.equals("reverse")){
+				reverseOrder=Parse.parseBoolean(b);
+			}else if(a.equals("silva")){
+				TaxTree.SILVA_MODE=Parse.parseBoolean(b);
+			}else if(a.equals("unite")){
+				TaxTree.UNITE_MODE=Parse.parseBoolean(b);
+			}else if(a.equals("simple")){
+				skipNonCanonical=Parse.parseBoolean(b);
+			}else if(a.equals("column")){
+				keyColumn=Integer.parseInt(b);
+			}else if(b!=null && (a.equals("name") || a.equals("names") || a.equals("id") || a.equals("ids"))){
+				for(String s : b.split(",")){
+					names.add(s);
+				}
+			}else{
+				names.add(arg);
+			}
+		}
+		
+		if(taxTreeFile==null || "auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
+		if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
+		if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();}
+		
+		taxLevelExtended=TaxTree.levelToExtended(taxLevel);
+		minLevelExtended=TaxTree.levelToExtended(minLevel);
+		maxLevelExtended=TaxTree.levelToExtended(maxLevel);
+		
+		{//Process parser fields
+			overwrite=ReadStats.overwrite=parser.overwrite;
+			append=ReadStats.append=parser.append;
+
+			in1=parser.in1;
+			maxReads=parser.maxReads;
+		}
+		
+		//Ensure output files can be written
+		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+			outstream.println((out1==null)+", "+out1);
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
+		}
+		
+		//Create output FileFormat objects
+		ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false);
+		
+		ffcount=FileFormat.testOutput(countFile, FileFormat.TEXT, null, true, overwrite, append, false);
+		
+		//Create input FileFormat objects
+		ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, false);
+		
+		if(giTableFile!=null){
+			outstream.println("Loading gi table.");
+			GiToTaxid.initialize(giTableFile);
+		}
+		if(accessionFile!=null){
+			outstream.println("Loading accession table.");
+			AccessionToTaxid.load(accessionFile);
+		}
+		if(taxTreeFile!=null){
+			tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, true);
+			assert(tree.nameMap!=null);
+		}else{
+			tree=null;
+			throw new RuntimeException("No tree specified.");
+		}
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Outer Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Create read streams and process all data */
+	void process(Timer t){
+		
+		TextStreamWriter tsw=null;
+		if(ffout1!=null){
+			tsw=new TextStreamWriter(ffout1);
+			tsw.start();
+		}
+		
+		if(ffin1!=null){
+			if(ffin1.fasta() || ffin1.fastq() || ffin1.samOrBam() || ffin1.scarf()){
+				processReads(tsw);
+			}else{
+				processFile(new TextFile(ffin1), tsw);
+			}
+		}else{
+			processNames(tsw);
+		}
+		
+		if(tsw!=null){errorState|=tsw.poisonAndWait();}
+		
+		if(ffcount!=null){
+			TextStreamWriter tswc=new TextStreamWriter(ffcount);
+			tswc.start();
+			for(TaxNode tn : tree.nodes){
+				if(tn!=null && tn.countRaw>0){
+					tswc.println(tn.countRaw+"\t"+tn.name);
+				}
+			}
+			errorState|=tswc.poisonAndWait();
+		}
+		
+		t.stop();
+		
+		//Throw an exception of there was an error in a thread
+		if(errorState){
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+	
+	/** Iterate through the names */
+	void processNames(final TextStreamWriter tsw){
+		for(String name : names){
+			if(taxLevelExtended>0){
+				printTaxLevel(name, tsw);
+			}else{
+				printTaxonomy(name, tsw);
+			}
+		}
+	}
+	
+	/** Iterate through the names */
+	void processFile(final TextFile tf, final TextStreamWriter tsw){
+		for(String name=tf.nextLine(); name!=null; name=tf.nextLine()){
+			
+			if(keyColumn>=0){
+				String result=translateLine(name, keyColumn);
+				tsw.print(result);
+			}else if(taxLevelExtended>0){
+				printTaxLevel(name, tsw);
+			}else{
+				printTaxonomy(name, tsw);
+			}
+		}
+	}
+	
+	/** Iterate through the names */
+	void processReads(final TextStreamWriter tsw){
+		final ConcurrentReadInputStream cris;
+		{
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+			if(verbose){System.err.println("Started cris");}
+			cris.start();
+		}
+		
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+		
+		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
+
+			for(Read r1 : reads){
+				if(keyColumn>=0){
+					String result=translateLine(r1.id, keyColumn);
+					tsw.println(result);
+				}else if(taxLevelExtended>0){
+					printTaxLevel(r1.id, tsw);
+				}else{
+					printTaxonomy(r1.id, tsw);
+				}
+			}
+			cris.returnList(ln);
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		cris.returnList(ln);
+		ReadWrite.closeStreams(cris);
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Inner Methods        ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	String translateLine(String line, int col){
+		StringBuilder sb=new StringBuilder();
+		String[] split=line.split("\t");
+		assert(split.length>col) : "Too few columns in line:\n"+line+"\n->\n"+Arrays.toString(split);
+		
+		if(col<split.length){
+			String name=split[col];
+			while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
+			
+			TaxNode tn=parseNodeFromHeader(name);
+			if(tn!=null){
+				String tl=makeTaxLine(tn, minLevelExtended, maxLevelExtended).toString();
+				split[col]=tl;
+			}else{
+				List<TaxNode> list=tree.getNodesByNameExtended(name);
+				if(list!=null){
+					String tab="";
+					for(TaxNode tn2 : list){
+						sb.append(tab);
+						sb.append(makeTaxLine(tn2, minLevelExtended, maxLevelExtended).toString());
+						tab="\t";
+					}
+				}else{
+					split[col]=split[col]+"_***NOT_FOUND***";
+				}
+			}
+		}
+		
+		for(int i=0; i<split.length; i++){
+			if(i>0){sb.append('\t');}
+			sb.append(split[i]);
+		}
+		sb.append('\n');
+		return sb.toString();
+	}
+	
+	void printTaxonomy(String name, final TextStreamWriter tsw){
+		while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
+		tsw.print("\n");
+		if(printName){tsw.print(name+":\n");}
+		TaxNode tn=parseNodeFromHeader(name);
+		if(tn!=null){
+			printTaxonomy(tn, tsw);
+			return;
+		}else{
+			List<TaxNode> list=tree.getNodesByNameExtended(name);
+			if(list!=null){
+				String nl="";
+				for(TaxNode tn2 : list){
+					tsw.print(nl);
+					printTaxonomy(tn2, tsw);
+					nl="\n";
+				}
+				return;
+			}
+		}
+		tsw.println("Could not find node" + (printName ? "." : " for '"+name+"'"));
+		return;
+	}
+	
+	void printTaxLevel(String name, final TextStreamWriter tsw){
+		while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
+		tsw.print("\n");
+		if(printName){tsw.print(name+":\n");}
+		TaxNode tn=parseNodeFromHeader(name);
+		if(tn!=null){
+			printTaxLevel(tn, tsw);
+			return;
+		}else{
+			List<TaxNode> list=tree.getNodesByNameExtended(name);
+			if(list!=null){
+				for(TaxNode tn2 : list){
+					printTaxLevel(tn2, tsw);
+				}
+				return;
+			}
+		}
+		tsw.println("Could not find node" + (printName ? "." : " for '"+name+"'"));
+		return;
+	}
+	
+//	void printTaxCounts(String name, final TextStreamWriter tsw){
+//		TaxNode tn=null;
+//		tn=tree.getNode(name);
+//		if(tn==null){tn=tree.getNodeByName(name);}
+//		if(tn==null){tn=unknown;}
+//		while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);}
+//		if(tsw!=null)tsw.println(tn.name);
+//		tn.incrementRaw(1);
+//	}
+	
+	void printTaxonomy(TaxNode tn, final TextStreamWriter tsw){
+//		assert(false) : tn.levelExtended+", "+taxLevelExtended+", "+minLevelExtended+", "+maxLevelExtended;
+		assert(tn!=null);
+//		tsw.print("\n");
+		do{
+			if(tn.levelExtended<=taxLevelExtended){tn.incrementRaw(1);}
+			if(tn.levelExtended>=minLevelExtended && tn.levelExtended<=maxLevelExtended){
+				if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
+					tsw.println(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name);
+				}
+			}
+			tn=tree.getNode(tn.pid);
+		}while(tn!=null && tn.id!=tn.pid);
+	}
+	
+	StringBuilder makeTaxLine(TaxNode tn, int minLevelE, int maxLevelE){
+//		assert(false) : tn+", "+minLevelE+", "+maxLevelE;
+		assert(tn!=null);
+		StringBuilder sb=new StringBuilder();
+		
+		if(reverseOrder){
+			ArrayList<TaxNode> list=new ArrayList<TaxNode>();
+			while(tn.levelExtended<=maxLevelE){
+				if(tn.levelExtended>=minLevelE){
+					if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
+						list.add(tn);
+					}
+				}
+				if(tn.id==tn.pid){break;}
+				tn=tree.getNode(tn.pid);
+			}
+			
+			String semi="";
+			Collections.reverse(list);
+			for(TaxNode tn2 : list){
+				sb.append(semi);
+				sb.append(tn2.levelToStringShort());
+				sb.append("__");
+				sb.append(tn2.name);
+				semi=";";
+			}
+		}else{
+			String semi="";
+			while(tn.levelExtended<=maxLevelE){
+				if(tn.levelExtended>=minLevelE && !tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
+					sb.append(semi);
+					sb.append(tn.levelToStringShort());
+					sb.append("__");
+					sb.append(tn.name);
+					semi=";";
+				}
+				if(tn.id==tn.pid){break;}
+				tn=tree.getNode(tn.pid);
+			}
+		}
+		
+		return sb;
+	}
+	
+//	public static void printTaxonomy(TaxNode tn, final StringBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){
+//		final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
+//		assert(tn!=null);
+////		tsw.print("\n");
+//		do{
+//			if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
+//				sb.append(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name+"\n");
+//			}
+//			tn=tree.getNode(tn.pid);
+//		}while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
+//	}
+	
+	public static void printTaxonomy(TaxNode tn, final ByteBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){
+		final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
+		assert(tn!=null);
+//		tsw.print("\n");
+		do{
+			if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
+				sb.append(tn.levelStringExtended(false)).append('\t').append(tn.id).append('\t').append(tn.name).append('\n');
+			}
+			tn=tree.getNode(tn.pid);
+		}while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
+	}
+	
+//	public static void printTaxonomy(TaxNode tn, final TextStreamWriter tsw, final TaxTree tree, final int maxLevel){
+//		final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
+//		assert(tn!=null);
+////		tsw.print("\n");
+//		do{
+//			if(!skipNonCanonical || tn.isSimple()){
+//				tsw.println(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name);
+//			}
+//			tn=tree.getNode(tn.pid);
+//		}while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
+//	}
+	
+	void printTaxLevel(TaxNode tn, final TextStreamWriter tsw){
+		if(tn==null){tn=unknown;}
+		while(tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){tn=tree.getNode(tn.pid);}
+		if(tsw!=null){tsw.println(tn.name);}
+		tn.incrementRaw(1);
+	}
+	
+//	void printTaxCounts(TaxNode tn, final TextStreamWriter tsw){
+//		if(tn==null){tn=unknown;}
+//		while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);}
+//		if(tsw!=null)tsw.println(tn.name);
+//		tn.incrementRaw(1);
+//	}
+	
+	public TaxNode parseNodeFromHeader(String header){
+		if(tree==null){return null;}
+		return tree.parseNodeFromHeader(header, true);
+	}
+	
+	/*--------------------------------------------------------------*/
+	/*----------------            Fields            ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/** Optional input file path */
+	private String in1=null;
+
+	/** Primary output file path */
+	private String out1="stdout.txt";
+	
+	private String countFile=null;
+
+	private String giTableFile=null;
+	private String taxTreeFile=null;
+	private String accessionFile=null;
+	
+	private final TaxTree tree;
+	
+//	/** Level to print */
+//	private int taxLevel=-1;//TaxTree.stringToLevel("phylum");
+//
+//	/** Min level to print */
+//	private int minLevel=-1;
+//
+//	/** Max level to print */
+//	private int maxLevel=TaxTree.stringToLevel("life");
+	
+	private final int taxLevelExtended, minLevelExtended, maxLevelExtended;
+	
+	/** Reverse order for tax lines */
+	private boolean reverseOrder=true;
+	
+	private ArrayList<String> names=new ArrayList<String>();
+	
+	private long maxReads=-1;
+	
+	boolean printName=true;
+	boolean skipNonCanonical=false;
+	
+	int keyColumn=-1;
+//	Deprecated.  Description from shellscript:
+//	column=-1       If set to a non-negative integer, parse the taxonomy
+//            information from this column in a tab-delimited file.
+//            Example if column=1:
+//            read1 TAB gi|944259871|gb|KQL24128.1| TAB score:42
+//            becomes
+//            read1 TAB  k__Viridiplantae;p__Streptophyta;... TAB score:42
+	
+	/*--------------------------------------------------------------*/
+	/*----------------         Final Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Optional input file */
+	private final FileFormat ffin1;
+	
+	/** Primary output file */
+	private final FileFormat ffout1;
+	
+	private final FileFormat ffcount;
+	
+	private final TaxNode unknown=new TaxNode(-99, -99, TaxTree.LIFE, TaxTree.LIFE_E, "UNKNOWN");
+	
+	/*--------------------------------------------------------------*/
+	/*----------------        Common Fields         ----------------*/
+	/*--------------------------------------------------------------*/
+	
+	/** Print status messages to this output stream */
+	private PrintStream outstream=System.err;
+	/** Print verbose messages */
+	public static boolean verbose=false;
+	/** True if an error was encountered */
+	public boolean errorState=false;
+	/** Overwrite existing output files */
+	private boolean overwrite=false;
+	/** Append to existing output files */
+	private boolean append=false;
+	
+}