Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/PrintTaxonomy.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
// Provenance: file added in Mercurial changeset 68:5028fdace37b (diff against /dev/null, 559 lines in the original).
package tax;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import fileIO.FileFormat;
import fileIO.ReadWrite;
import fileIO.TextFile;
import fileIO.TextStreamWriter;
import shared.Parse;
import shared.Parser;
import shared.PreParser;
import shared.ReadStats;
import shared.Shared;
import shared.Timer;
import shared.Tools;
import stream.ConcurrentReadInputStream;
import stream.Read;
import structures.ByteBuilder;
import structures.ListNum;

/**
 * Prints the taxonomic lineage of names, identifiers, or sequence headers.
 * Each query is resolved against a loaded {@link TaxTree}, first by parsing
 * it as a header and then by name lookup; the node and its ancestors are then
 * written out. Queries may come from the command line, a text file (one per
 * line), or the headers of a sequence file. Optionally, per-node hit counts
 * are written to a separate file.
 *
 * @author Brian Bushnell
 * @date November 23, 2015
 *
 */
public class PrintTaxonomy {

	/*--------------------------------------------------------------*/
	/*----------------        Initialization        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Code entrance from the command line.
	 * @param args Command line arguments
	 */
	public static void main(String[] args){
		Timer t=new Timer();
		PrintTaxonomy x=new PrintTaxonomy(args);
		x.process(t);

		//Close the print stream if it was redirected
		Shared.closeStream(x.outstream);
	}

	/**
	 * Constructor. Parses the arguments, validates the output path, and loads
	 * the gi table, accession table, and taxonomy tree that were requested.
	 * Any argument that is not a recognized "key=value" flag is treated as a
	 * name to look up.
	 * @param args Command line arguments
	 * @throws RuntimeException If the output file cannot be written or no tree file is available
	 */
	public PrintTaxonomy(String[] args){

		{//Preparse block for help, config files, and outstream
			PreParser pp=new PreParser(args, getClass(), false);
			args=pp.args;
			outstream=pp.outstream;
		}

		//Set shared static variables
		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
		ReadWrite.MAX_ZIP_THREADS=Shared.threads();

		//Create a parser object
		Parser parser=new Parser();

		int taxLevel=0, minLevel=0, maxLevel=TaxTree.LIFE;

		//Parse each argument
		for(int i=0; i<args.length; i++){
			String arg=args[i];

			//Break arguments into their constituent parts, in the form of "a=b"
			String[] split=arg.split("=");
			String a=split[0].toLowerCase();
			String b=split.length>1 ? split[1] : null;

			if(a.equals("out")){
				out1=b;
			}else if(a.equals("counts")){
				countFile=b;
			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
				//do nothing
			}else if(a.equals("verbose")){
				verbose=Parse.parseBoolean(b);
			}else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
				giTableFile=b;
			}else if(a.equals("accession")){
				accessionFile=b;
			}else if(a.equals("tree") || a.equals("taxtree")){
				taxTreeFile=b;
			}else if(a.equals("level") || a.equals("lv") || a.equals("taxlevel") || a.equals("tl")){
				taxLevel=TaxTree.parseLevel(b);
			}else if(a.equals("minlevel")){
				minLevel=TaxTree.parseLevel(b);
			}else if(a.equals("maxlevel")){
				maxLevel=TaxTree.parseLevel(b);
			}else if(a.equals("printname")){
				printName=Parse.parseBoolean(b);
			}else if(a.equals("reverse")){
				reverseOrder=Parse.parseBoolean(b);
			}else if(a.equals("silva")){
				TaxTree.SILVA_MODE=Parse.parseBoolean(b);
			}else if(a.equals("unite")){
				TaxTree.UNITE_MODE=Parse.parseBoolean(b);
			}else if(a.equals("simple")){
				skipNonCanonical=Parse.parseBoolean(b);
			}else if(a.equals("column")){
				keyColumn=Integer.parseInt(b);
			}else if(b!=null && (a.equals("name") || a.equals("names") || a.equals("id") || a.equals("ids"))){
				for(String s : b.split(",")){
					names.add(s);
				}
			}else{
				//Anything unrecognized is treated as a query name
				names.add(arg);
			}
		}

		//Resolve "auto" (and, for the tree only, an absent value) to the default paths
		if(taxTreeFile==null || "auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
		if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
		if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();}

		taxLevelExtended=TaxTree.levelToExtended(taxLevel);
		minLevelExtended=TaxTree.levelToExtended(minLevel);
		maxLevelExtended=TaxTree.levelToExtended(maxLevel);

		{//Process parser fields
			overwrite=ReadStats.overwrite=parser.overwrite;
			append=ReadStats.append=parser.append;

			in1=parser.in1;
			maxReads=parser.maxReads;
		}

		//Ensure output files can be written
		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
			outstream.println((out1==null)+", "+out1);
			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
		}

		//Create output FileFormat objects
		ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false);

		ffcount=FileFormat.testOutput(countFile, FileFormat.TEXT, null, true, overwrite, append, false);

		//Create input FileFormat objects
		ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, false);

		if(giTableFile!=null){
			outstream.println("Loading gi table.");
			GiToTaxid.initialize(giTableFile);
		}
		if(accessionFile!=null){
			outstream.println("Loading accession table.");
			AccessionToTaxid.load(accessionFile);
		}
		if(taxTreeFile!=null){
			tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, true);
			assert(tree.nameMap!=null);
		}else{
			tree=null;
			throw new RuntimeException("No tree specified.");
		}
	}

	/*--------------------------------------------------------------*/
	/*----------------         Outer Methods        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Runs the whole job: opens the output writer, dispatches to the
	 * appropriate input mode (sequence file, text file, or command-line
	 * names), then writes the optional per-node count file.
	 * @param t Timer started by the caller; stopped here
	 * @throws RuntimeException If any stream reported an error
	 */
	void process(Timer t){

		TextStreamWriter tsw=null;
		if(ffout1!=null){
			tsw=new TextStreamWriter(ffout1);
			tsw.start();
		}

		if(ffin1!=null){
			if(ffin1.fasta() || ffin1.fastq() || ffin1.samOrBam() || ffin1.scarf()){
				processReads(tsw);
			}else{
				//The file is opened here, so it is also closed here rather than leaked
				final TextFile tf=new TextFile(ffin1);
				try{
					processFile(tf, tsw);
				}finally{
					tf.close();
				}
			}
		}else{
			processNames(tsw);
		}

		if(tsw!=null){errorState|=tsw.poisonAndWait();}

		if(ffcount!=null){
			TextStreamWriter tswc=new TextStreamWriter(ffcount);
			tswc.start();
			for(TaxNode tn : tree.nodes){
				if(tn!=null && tn.countRaw>0){
					tswc.println(tn.countRaw+"\t"+tn.name);
				}
			}
			errorState|=tswc.poisonAndWait();
		}

		t.stop();

		//Throw an exception if there was an error in a thread
		if(errorState){
			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
		}
	}

	/**
	 * Looks up every name supplied on the command line.
	 * @param tsw Destination writer; may be null, in which case only counts are updated
	 */
	void processNames(final TextStreamWriter tsw){
		for(String name : names){
			if(taxLevelExtended>0){
				printTaxLevel(name, tsw);
			}else{
				printTaxonomy(name, tsw);
			}
		}
	}

	/**
	 * Looks up every line of a text file. When keyColumn is non-negative the
	 * line is treated as tab-delimited and only that column is translated.
	 * The caller remains responsible for closing the file.
	 * @param tf Open text file to read
	 * @param tsw Destination writer; may be null
	 */
	void processFile(final TextFile tf, final TextStreamWriter tsw){
		for(String name=tf.nextLine(); name!=null; name=tf.nextLine()){

			if(keyColumn>=0){
				//translateLine already terminates its result with a newline
				print(tsw, translateLine(name, keyColumn));
			}else if(taxLevelExtended>0){
				printTaxLevel(name, tsw);
			}else{
				printTaxonomy(name, tsw);
			}
		}
	}

	/**
	 * Looks up the header (id) of every read in the input sequence file.
	 * @param tsw Destination writer; may be null
	 */
	void processReads(final TextStreamWriter tsw){
		final ConcurrentReadInputStream cris;
		{
			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
			if(verbose){System.err.println("Started cris");}
			cris.start();
		}

		ListNum<Read> ln=cris.nextList();
		ArrayList<Read> reads=(ln!=null ? ln.list : null);

		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning

			for(Read r1 : reads){
				if(keyColumn>=0){
					//print, not println: the translated line already ends in '\n',
					//and this keeps the output identical to processFile
					print(tsw, translateLine(r1.id, keyColumn));
				}else if(taxLevelExtended>0){
					printTaxLevel(r1.id, tsw);
				}else{
					printTaxonomy(r1.id, tsw);
				}
			}
			cris.returnList(ln);
			ln=cris.nextList();
			reads=(ln!=null ? ln.list : null);
		}
		cris.returnList(ln);
		errorState|=ReadWrite.closeStreams(cris);
	}

	/*--------------------------------------------------------------*/
	/*----------------         Inner Methods        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Replaces one column of a tab-delimited line with its taxonomy line.
	 * If the key matches several nodes, their tax lines are joined with tabs
	 * inside that column. If nothing matches, "_***NOT_FOUND***" is appended
	 * to the original key. All other columns, including trailing empty ones,
	 * are passed through untouched.
	 * @param line Tab-delimited input line
	 * @param col Zero-based index of the column holding the key
	 * @return The translated line, terminated by a newline
	 */
	String translateLine(String line, int col){
		//Limit -1 keeps trailing empty columns so the column count is preserved
		final String[] split=line.split("\t", -1);
		assert(split.length>col) : "Too few columns in line:\n"+line+"\n->\n"+Arrays.toString(split);

		if(col>=0 && col<split.length){
			final String name=stripHeaderSymbols(split[col]);
			final List<TaxNode> nodes=lookup(name);
			if(nodes.isEmpty()){
				split[col]=split[col]+"_***NOT_FOUND***";
			}else{
				//Write the result into the key column itself, not in front of the line
				final StringBuilder joined=new StringBuilder();
				String tab="";
				for(TaxNode tn : nodes){
					joined.append(tab);
					joined.append(makeTaxLine(tn, minLevelExtended, maxLevelExtended));
					tab="\t";
				}
				split[col]=joined.toString();
			}
		}

		final StringBuilder sb=new StringBuilder();
		for(int i=0; i<split.length; i++){
			if(i>0){sb.append('\t');}
			sb.append(split[i]);
		}
		sb.append('\n');
		return sb.toString();
	}

	/**
	 * Prints the full lineage of every node matching a name. Each record is
	 * preceded by a blank line and, if printName is set, by the name itself.
	 * @param name Query; leading '>' and '@' characters are ignored
	 * @param tsw Destination writer; may be null
	 */
	void printTaxonomy(String name, final TextStreamWriter tsw){
		name=stripHeaderSymbols(name);
		print(tsw, "\n");
		if(printName){print(tsw, name+":\n");}

		final List<TaxNode> nodes=lookup(name);
		if(nodes.isEmpty()){
			println(tsw, "Could not find node" + (printName ? "." : " for '"+name+"'"));
			return;
		}
		String nl="";
		for(TaxNode tn : nodes){
			//Separate multiple matches with a blank line
			print(tsw, nl);
			printTaxonomy(tn, tsw);
			nl="\n";
		}
	}

	/**
	 * Prints, for every node matching a name, the ancestor at the requested
	 * taxonomic level. Each record is preceded by a blank line and, if
	 * printName is set, by the name itself.
	 * @param name Query; leading '>' and '@' characters are ignored
	 * @param tsw Destination writer; may be null
	 */
	void printTaxLevel(String name, final TextStreamWriter tsw){
		name=stripHeaderSymbols(name);
		print(tsw, "\n");
		if(printName){print(tsw, name+":\n");}

		final List<TaxNode> nodes=lookup(name);
		if(nodes.isEmpty()){
			println(tsw, "Could not find node" + (printName ? "." : " for '"+name+"'"));
			return;
		}
		for(TaxNode tn : nodes){
			printTaxLevel(tn, tsw);
		}
	}

	/**
	 * Walks from a node toward the root, printing one "level TAB id TAB name"
	 * line per ancestor whose extended level lies within
	 * [minLevelExtended, maxLevelExtended]. Nodes at or below
	 * taxLevelExtended have their raw count incremented. The walk stops at a
	 * self-parented node, which is itself not visited unless it is the start.
	 * @param tn Starting node; must not be null
	 * @param tsw Destination writer; may be null, in which case only counts are updated
	 */
	void printTaxonomy(TaxNode tn, final TextStreamWriter tsw){
		assert(tn!=null);
		do{
			if(tn.levelExtended<=taxLevelExtended){tn.incrementRaw(1);}
			if(tn.levelExtended>=minLevelExtended && tn.levelExtended<=maxLevelExtended){
				if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
					println(tsw, tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name);
				}
			}
			tn=tree.getNode(tn.pid);
		}while(tn!=null && tn.id!=tn.pid);
	}

	/**
	 * Builds a single-line lineage of the form "x__Name;y__Name;...", using
	 * each node's short level string as the prefix. Only ancestors within
	 * [minLevelE, maxLevelE] are included; the walk stops at the first node
	 * above maxLevelE, at a self-parented node, or when a parent is missing.
	 * When reverseOrder is set the highest-level node comes first.
	 * @param tn Starting node; must not be null
	 * @param minLevelE Minimum extended level to include
	 * @param maxLevelE Maximum extended level to include
	 * @return The lineage text; empty if no node qualified
	 */
	StringBuilder makeTaxLine(TaxNode tn, int minLevelE, int maxLevelE){
		assert(tn!=null);

		//Collect qualifying nodes from the start node upward
		final ArrayList<TaxNode> lineage=new ArrayList<TaxNode>();
		while(tn!=null && tn.levelExtended<=maxLevelE){
			if(tn.levelExtended>=minLevelE && !tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
				lineage.add(tn);
			}
			if(tn.id==tn.pid){break;}
			tn=tree.getNode(tn.pid);
		}
		if(reverseOrder){Collections.reverse(lineage);}

		final StringBuilder sb=new StringBuilder();
		String semi="";
		for(TaxNode node : lineage){
			sb.append(semi);
			sb.append(node.levelToStringShort());
			sb.append("__");
			sb.append(node.name);
			semi=";";
		}
		return sb;
	}

	/**
	 * Appends the lineage of a node to a ByteBuilder, one
	 * "level TAB id TAB name" line per ancestor. The starting node is always
	 * considered; the walk then continues while the next ancestor exists, is
	 * not self-parented, and does not exceed the maximum level.
	 * @param tn Starting node; must not be null
	 * @param sb Destination buffer
	 * @param tree Tree used to resolve parents
	 * @param maxLevel Maximum (non-extended) level; a negative value is used as-is without conversion
	 * @param skipNonCanonical If true, only nodes for which isSimple() holds are printed
	 */
	public static void printTaxonomy(TaxNode tn, final ByteBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){
		final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
		assert(tn!=null);
		do{
			if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
				sb.append(tn.levelStringExtended(false)).append('\t').append(tn.id).append('\t').append(tn.name).append('\n');
			}
			tn=tree.getNode(tn.pid);
		}while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
	}

	/**
	 * Promotes a node to the requested taxonomic level, prints the resulting
	 * node's name, and increments its raw count. A null node is replaced by
	 * the UNKNOWN placeholder. If a parent cannot be resolved, the last node
	 * reached is used.
	 * @param tn Starting node; may be null
	 * @param tsw Destination writer; may be null, in which case only the count is updated
	 */
	void printTaxLevel(TaxNode tn, final TextStreamWriter tsw){
		if(tn==null){tn=unknown;}
		while(tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){
			final TaxNode parent=tree.getNode(tn.pid);
			if(parent==null){break;}//Broken parent link; stay on the last valid node
			tn=parent;
		}
		println(tsw, tn.name);
		tn.incrementRaw(1);
	}

	/**
	 * Resolves a sequence header to a node using the loaded tree.
	 * @param header Header text
	 * @return The node, or null if there is no tree or nothing matched
	 */
	public TaxNode parseNodeFromHeader(String header){
		if(tree==null){return null;}
		return tree.parseNodeFromHeader(header, true);
	}

	/*--------------------------------------------------------------*/
	/*----------------        Private Helpers       ----------------*/
	/*--------------------------------------------------------------*/

	/** Removes any leading '>' or '@' characters from a header. */
	private static String stripHeaderSymbols(String name){
		while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
		return name;
	}

	/** Resolves a query by header parsing first, then by name; never returns null. */
	private List<TaxNode> lookup(String name){
		final TaxNode tn=parseNodeFromHeader(name);
		if(tn!=null){return Collections.singletonList(tn);}
		final List<TaxNode> list=tree.getNodesByNameExtended(name);
		return list==null ? Collections.<TaxNode>emptyList() : list;
	}

	/** Prints to the writer if one is present. */
	private static void print(final TextStreamWriter tsw, final String s){
		if(tsw!=null){tsw.print(s);}
	}

	/** Prints a line to the writer if one is present. */
	private static void println(final TextStreamWriter tsw, final String s){
		if(tsw!=null){tsw.println(s);}
	}

	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/

	/** Optional input file path */
	private String in1=null;

	/** Primary output file path */
	private String out1="stdout.txt";

	/** Optional path for the per-node count file */
	private String countFile=null;

	/** Path of the gi table, or null to skip loading it */
	private String giTableFile=null;
	/** Path of the taxonomy tree */
	private String taxTreeFile=null;
	/** Path of the accession table, or null to skip loading it */
	private String accessionFile=null;

	/** Loaded taxonomy tree */
	private final TaxTree tree;

	/** Extended-level forms of the level, minlevel, and maxlevel flags */
	private final int taxLevelExtended, minLevelExtended, maxLevelExtended;

	/** Reverse order for tax lines */
	private boolean reverseOrder=true;

	/** Query names collected from the command line */
	private ArrayList<String> names=new ArrayList<String>();

	/** Maximum number of reads to process; taken from the parser */
	private long maxReads=-1;

	/** Print the query name before its results */
	boolean printName=true;
	/** Only print nodes for which isSimple() holds */
	boolean skipNonCanonical=false;

	int keyColumn=-1;
//	Deprecated.  Description from shellscript:
//	column=-1       If set to a non-negative integer, parse the taxonomy
//	                information from this column in a tab-delimited file.
//	                Example if column=1:
//	                read1 TAB gi|944259871|gb|KQL24128.1| TAB score:42
//	                becomes
//	                read1 TAB k__Viridiplantae;p__Streptophyta;... TAB score:42

	/*--------------------------------------------------------------*/
	/*----------------         Final Fields         ----------------*/
	/*--------------------------------------------------------------*/

	/** Optional input file */
	private final FileFormat ffin1;

	/** Primary output file */
	private final FileFormat ffout1;

	/** Optional count output file */
	private final FileFormat ffcount;

	/** Placeholder used when a node cannot be resolved; it is its own parent */
	private final TaxNode unknown=new TaxNode(-99, -99, TaxTree.LIFE, TaxTree.LIFE_E, "UNKNOWN");

	/*--------------------------------------------------------------*/
	/*----------------        Common Fields         ----------------*/
	/*--------------------------------------------------------------*/

	/** Print status messages to this output stream */
	private PrintStream outstream=System.err;
	/** Print verbose messages */
	public static boolean verbose=false;
	/** True if an error was encountered */
	public boolean errorState=false;
	/** Overwrite existing output files */
	private boolean overwrite=false;
	/** Append to existing output files */
	private boolean append=false;

}