comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/PrintTaxonomy.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package tax;
2
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Collections;
7 import java.util.List;
8
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import fileIO.TextFile;
12 import fileIO.TextStreamWriter;
13 import shared.Parse;
14 import shared.Parser;
15 import shared.PreParser;
16 import shared.ReadStats;
17 import shared.Shared;
18 import shared.Timer;
19 import shared.Tools;
20 import stream.ConcurrentReadInputStream;
21 import stream.Read;
22 import structures.ByteBuilder;
23 import structures.ListNum;
24
25 /**
26 * Prints the taxonomic lineage of names, text-file lines, or sequences,
27 * as determined by the name or sequence header. Sequences should
28 * be labeled with a gi number or NCBI taxID.
29 *
30 * @author Brian Bushnell
31 * @date November 23, 2015
32 *
33 */
34 public class PrintTaxonomy {
35
36 /*--------------------------------------------------------------*/
37 /*---------------- Initialization ----------------*/
38 /*--------------------------------------------------------------*/
39
40 /**
41 * Code entrance from the command line.
42 * @param args Command line arguments
43 */
44 public static void main(String[] args){
45 Timer t=new Timer();
46 PrintTaxonomy x=new PrintTaxonomy(args);
47 x.process(t);
48
49 //Close the print stream if it was redirected
50 Shared.closeStream(x.outstream);
51 }
52
53 /**
54 * Constructor.
55 * @param args Command line arguments
56 */
57 public PrintTaxonomy(String[] args){
58
59 {//Preparse block for help, config files, and outstream
60 PreParser pp=new PreParser(args, getClass(), false);
61 args=pp.args;
62 outstream=pp.outstream;
63 }
64
65 //Set shared static variables
66 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
67 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
68
69 //Create a parser object
70 Parser parser=new Parser();
71
72 int taxLevel=0, minLevel=0, maxLevel=TaxTree.LIFE;
73
74 //Parse each argument
75 for(int i=0; i<args.length; i++){
76 String arg=args[i];
77
78 //Break arguments into their constituent parts, in the form of "a=b"
79 String[] split=arg.split("=");
80 String a=split[0].toLowerCase();
81 String b=split.length>1 ? split[1] : null;
82
83 if(a.equals("out")){
84 out1=b;
85 }else if(a.equals("counts")){
86 countFile=b;
87 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
88 //do nothing
89 }else if(a.equals("verbose")){
90 verbose=Parse.parseBoolean(b);
91 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
92 giTableFile=b;
93 }else if(a.equals("accession")){
94 accessionFile=b;
95 }else if(a.equals("tree") || a.equals("taxtree")){
96 taxTreeFile=b;
97 }else if(a.equals("level") || a.equals("lv") || a.equals("taxlevel") || a.equals("tl")){
98 taxLevel=TaxTree.parseLevel(b);
99 }else if(a.equals("minlevel")){
100 minLevel=TaxTree.parseLevel(b);
101 }else if(a.equals("maxlevel")){
102 maxLevel=TaxTree.parseLevel(b);
103 }else if(a.equals("printname")){
104 printName=Parse.parseBoolean(b);
105 }else if(a.equals("reverse")){
106 reverseOrder=Parse.parseBoolean(b);
107 }else if(a.equals("silva")){
108 TaxTree.SILVA_MODE=Parse.parseBoolean(b);
109 }else if(a.equals("unite")){
110 TaxTree.UNITE_MODE=Parse.parseBoolean(b);
111 }else if(a.equals("simple")){
112 skipNonCanonical=Parse.parseBoolean(b);
113 }else if(a.equals("column")){
114 keyColumn=Integer.parseInt(b);
115 }else if(b!=null && (a.equals("name") || a.equals("names") || a.equals("id") || a.equals("ids"))){
116 for(String s : b.split(",")){
117 names.add(s);
118 }
119 }else{
120 names.add(arg);
121 }
122 }
123
124 if(taxTreeFile==null || "auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
125 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
126 if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();}
127
128 taxLevelExtended=TaxTree.levelToExtended(taxLevel);
129 minLevelExtended=TaxTree.levelToExtended(minLevel);
130 maxLevelExtended=TaxTree.levelToExtended(maxLevel);
131
132 {//Process parser fields
133 overwrite=ReadStats.overwrite=parser.overwrite;
134 append=ReadStats.append=parser.append;
135
136 in1=parser.in1;
137 maxReads=parser.maxReads;
138 }
139
140 //Ensure output files can be written
141 if(!Tools.testOutputFiles(overwrite, append, false, out1)){
142 outstream.println((out1==null)+", "+out1);
143 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
144 }
145
146 //Create output FileFormat objects
147 ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false);
148
149 ffcount=FileFormat.testOutput(countFile, FileFormat.TEXT, null, true, overwrite, append, false);
150
151 //Create input FileFormat objects
152 ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, false);
153
154 if(giTableFile!=null){
155 outstream.println("Loading gi table.");
156 GiToTaxid.initialize(giTableFile);
157 }
158 if(accessionFile!=null){
159 outstream.println("Loading accession table.");
160 AccessionToTaxid.load(accessionFile);
161 }
162 if(taxTreeFile!=null){
163 tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, true);
164 assert(tree.nameMap!=null);
165 }else{
166 tree=null;
167 throw new RuntimeException("No tree specified.");
168 }
169 }
170
171 /*--------------------------------------------------------------*/
172 /*---------------- Outer Methods ----------------*/
173 /*--------------------------------------------------------------*/
174
175 /** Create read streams and process all data */
176 void process(Timer t){
177
178 TextStreamWriter tsw=null;
179 if(ffout1!=null){
180 tsw=new TextStreamWriter(ffout1);
181 tsw.start();
182 }
183
184 if(ffin1!=null){
185 if(ffin1.fasta() || ffin1.fastq() || ffin1.samOrBam() || ffin1.scarf()){
186 processReads(tsw);
187 }else{
188 processFile(new TextFile(ffin1), tsw);
189 }
190 }else{
191 processNames(tsw);
192 }
193
194 if(tsw!=null){errorState|=tsw.poisonAndWait();}
195
196 if(ffcount!=null){
197 TextStreamWriter tswc=new TextStreamWriter(ffcount);
198 tswc.start();
199 for(TaxNode tn : tree.nodes){
200 if(tn!=null && tn.countRaw>0){
201 tswc.println(tn.countRaw+"\t"+tn.name);
202 }
203 }
204 errorState|=tswc.poisonAndWait();
205 }
206
207 t.stop();
208
209 //Throw an exception of there was an error in a thread
210 if(errorState){
211 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
212 }
213 }
214
215 /** Iterate through the names */
216 void processNames(final TextStreamWriter tsw){
217 for(String name : names){
218 if(taxLevelExtended>0){
219 printTaxLevel(name, tsw);
220 }else{
221 printTaxonomy(name, tsw);
222 }
223 }
224 }
225
226 /** Iterate through the names */
227 void processFile(final TextFile tf, final TextStreamWriter tsw){
228 for(String name=tf.nextLine(); name!=null; name=tf.nextLine()){
229
230 if(keyColumn>=0){
231 String result=translateLine(name, keyColumn);
232 tsw.print(result);
233 }else if(taxLevelExtended>0){
234 printTaxLevel(name, tsw);
235 }else{
236 printTaxonomy(name, tsw);
237 }
238 }
239 }
240
241 /** Iterate through the names */
242 void processReads(final TextStreamWriter tsw){
243 final ConcurrentReadInputStream cris;
244 {
245 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
246 if(verbose){System.err.println("Started cris");}
247 cris.start();
248 }
249
250 ListNum<Read> ln=cris.nextList();
251 ArrayList<Read> reads=(ln!=null ? ln.list : null);
252
253 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
254
255 for(Read r1 : reads){
256 if(keyColumn>=0){
257 String result=translateLine(r1.id, keyColumn);
258 tsw.println(result);
259 }else if(taxLevelExtended>0){
260 printTaxLevel(r1.id, tsw);
261 }else{
262 printTaxonomy(r1.id, tsw);
263 }
264 }
265 cris.returnList(ln);
266 ln=cris.nextList();
267 reads=(ln!=null ? ln.list : null);
268 }
269 cris.returnList(ln);
270 ReadWrite.closeStreams(cris);
271 }
272
273 /*--------------------------------------------------------------*/
274 /*---------------- Inner Methods ----------------*/
275 /*--------------------------------------------------------------*/
276
277 String translateLine(String line, int col){
278 StringBuilder sb=new StringBuilder();
279 String[] split=line.split("\t");
280 assert(split.length>col) : "Too few columns in line:\n"+line+"\n->\n"+Arrays.toString(split);
281
282 if(col<split.length){
283 String name=split[col];
284 while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
285
286 TaxNode tn=parseNodeFromHeader(name);
287 if(tn!=null){
288 String tl=makeTaxLine(tn, minLevelExtended, maxLevelExtended).toString();
289 split[col]=tl;
290 }else{
291 List<TaxNode> list=tree.getNodesByNameExtended(name);
292 if(list!=null){
293 String tab="";
294 for(TaxNode tn2 : list){
295 sb.append(tab);
296 sb.append(makeTaxLine(tn2, minLevelExtended, maxLevelExtended).toString());
297 tab="\t";
298 }
299 }else{
300 split[col]=split[col]+"_***NOT_FOUND***";
301 }
302 }
303 }
304
305 for(int i=0; i<split.length; i++){
306 if(i>0){sb.append('\t');}
307 sb.append(split[i]);
308 }
309 sb.append('\n');
310 return sb.toString();
311 }
312
313 void printTaxonomy(String name, final TextStreamWriter tsw){
314 while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
315 tsw.print("\n");
316 if(printName){tsw.print(name+":\n");}
317 TaxNode tn=parseNodeFromHeader(name);
318 if(tn!=null){
319 printTaxonomy(tn, tsw);
320 return;
321 }else{
322 List<TaxNode> list=tree.getNodesByNameExtended(name);
323 if(list!=null){
324 String nl="";
325 for(TaxNode tn2 : list){
326 tsw.print(nl);
327 printTaxonomy(tn2, tsw);
328 nl="\n";
329 }
330 return;
331 }
332 }
333 tsw.println("Could not find node" + (printName ? "." : " for '"+name+"'"));
334 return;
335 }
336
337 void printTaxLevel(String name, final TextStreamWriter tsw){
338 while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);}
339 tsw.print("\n");
340 if(printName){tsw.print(name+":\n");}
341 TaxNode tn=parseNodeFromHeader(name);
342 if(tn!=null){
343 printTaxLevel(tn, tsw);
344 return;
345 }else{
346 List<TaxNode> list=tree.getNodesByNameExtended(name);
347 if(list!=null){
348 for(TaxNode tn2 : list){
349 printTaxLevel(tn2, tsw);
350 }
351 return;
352 }
353 }
354 tsw.println("Could not find node" + (printName ? "." : " for '"+name+"'"));
355 return;
356 }
357
358 // void printTaxCounts(String name, final TextStreamWriter tsw){
359 // TaxNode tn=null;
360 // tn=tree.getNode(name);
361 // if(tn==null){tn=tree.getNodeByName(name);}
362 // if(tn==null){tn=unknown;}
363 // while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);}
364 // if(tsw!=null)tsw.println(tn.name);
365 // tn.incrementRaw(1);
366 // }
367
368 void printTaxonomy(TaxNode tn, final TextStreamWriter tsw){
369 // assert(false) : tn.levelExtended+", "+taxLevelExtended+", "+minLevelExtended+", "+maxLevelExtended;
370 assert(tn!=null);
371 // tsw.print("\n");
372 do{
373 if(tn.levelExtended<=taxLevelExtended){tn.incrementRaw(1);}
374 if(tn.levelExtended>=minLevelExtended && tn.levelExtended<=maxLevelExtended){
375 if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
376 tsw.println(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name);
377 }
378 }
379 tn=tree.getNode(tn.pid);
380 }while(tn!=null && tn.id!=tn.pid);
381 }
382
383 StringBuilder makeTaxLine(TaxNode tn, int minLevelE, int maxLevelE){
384 // assert(false) : tn+", "+minLevelE+", "+maxLevelE;
385 assert(tn!=null);
386 StringBuilder sb=new StringBuilder();
387
388 if(reverseOrder){
389 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
390 while(tn.levelExtended<=maxLevelE){
391 if(tn.levelExtended>=minLevelE){
392 if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
393 list.add(tn);
394 }
395 }
396 if(tn.id==tn.pid){break;}
397 tn=tree.getNode(tn.pid);
398 }
399
400 String semi="";
401 Collections.reverse(list);
402 for(TaxNode tn2 : list){
403 sb.append(semi);
404 sb.append(tn2.levelToStringShort());
405 sb.append("__");
406 sb.append(tn2.name);
407 semi=";";
408 }
409 }else{
410 String semi="";
411 while(tn.levelExtended<=maxLevelE){
412 if(tn.levelExtended>=minLevelE && !tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
413 sb.append(semi);
414 sb.append(tn.levelToStringShort());
415 sb.append("__");
416 sb.append(tn.name);
417 semi=";";
418 }
419 if(tn.id==tn.pid){break;}
420 tn=tree.getNode(tn.pid);
421 }
422 }
423
424 return sb;
425 }
426
427 // public static void printTaxonomy(TaxNode tn, final StringBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){
428 // final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
429 // assert(tn!=null);
430 //// tsw.print("\n");
431 // do{
432 // if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
433 // sb.append(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name+"\n");
434 // }
435 // tn=tree.getNode(tn.pid);
436 // }while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
437 // }
438
439 public static void printTaxonomy(TaxNode tn, final ByteBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){
440 final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
441 assert(tn!=null);
442 // tsw.print("\n");
443 do{
444 if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){
445 sb.append(tn.levelStringExtended(false)).append('\t').append(tn.id).append('\t').append(tn.name).append('\n');
446 }
447 tn=tree.getNode(tn.pid);
448 }while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
449 }
450
451 // public static void printTaxonomy(TaxNode tn, final TextStreamWriter tsw, final TaxTree tree, final int maxLevel){
452 // final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel);
453 // assert(tn!=null);
454 //// tsw.print("\n");
455 // do{
456 // if(!skipNonCanonical || tn.isSimple()){
457 // tsw.println(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name);
458 // }
459 // tn=tree.getNode(tn.pid);
460 // }while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE);
461 // }
462
463 void printTaxLevel(TaxNode tn, final TextStreamWriter tsw){
464 if(tn==null){tn=unknown;}
465 while(tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){tn=tree.getNode(tn.pid);}
466 if(tsw!=null){tsw.println(tn.name);}
467 tn.incrementRaw(1);
468 }
469
470 // void printTaxCounts(TaxNode tn, final TextStreamWriter tsw){
471 // if(tn==null){tn=unknown;}
472 // while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);}
473 // if(tsw!=null)tsw.println(tn.name);
474 // tn.incrementRaw(1);
475 // }
476
477 public TaxNode parseNodeFromHeader(String header){
478 if(tree==null){return null;}
479 return tree.parseNodeFromHeader(header, true);
480 }
481
482 /*--------------------------------------------------------------*/
483 /*---------------- Fields ----------------*/
484 /*--------------------------------------------------------------*/
485
/** Optional input file path */
private String in1=null;

/** Primary output file path */
private String out1="stdout.txt";

/** Path of the optional counts report, set by the "counts" flag */
private String countFile=null;

/** Path of the gi table, set by "table", "gi" or "gitable"; "auto" selects the default */
private String giTableFile=null;
/** Path of the taxonomic tree, set by "tree" or "taxtree"; null or "auto" selects the default */
private String taxTreeFile=null;
/** Path of the accession table, set by "accession"; "auto" selects the default */
private String accessionFile=null;

/** Taxonomic tree used for all lookups; loaded in the constructor */
private final TaxTree tree;

//	/** Level to print */
//	private int taxLevel=-1;//TaxTree.stringToLevel("phylum");
//	
//	/** Min level to print */
//	private int minLevel=-1;
//	
//	/** Max level to print */
//	private int maxLevel=TaxTree.stringToLevel("life");

/** Extended-level forms of the "level", "minlevel" and "maxlevel" settings */
private final int taxLevelExtended, minLevelExtended, maxLevelExtended;

/** Reverse order for tax lines */
private boolean reverseOrder=true;

/** Names or IDs to look up when no input file is given; never reassigned, only appended to */
private final ArrayList<String> names=new ArrayList<String>();

/** Read limit copied from the parser and passed to the read input stream */
private long maxReads=-1;

/** Print each query name, followed by a colon, ahead of its results */
boolean printName=true;
/** Set by the "simple" flag; when true, only nodes for which isSimple() is true are emitted */
boolean skipNonCanonical=false;

/** Column to translate in tab-delimited input; negative disables column mode */
int keyColumn=-1;
//	Deprecated.  Description from shellscript:
//	column=-1       If set to a non-negative integer, parse the taxonomy
//	                information from this column in a tab-delimited file.
//	                Example if column=1:
//	                read1 TAB gi|944259871|gb|KQL24128.1| TAB score:42
//	                becomes
//	                read1 TAB k__Viridiplantae;p__Streptophyta;... TAB score:42

/*--------------------------------------------------------------*/
/*----------------         Final Fields         ----------------*/
/*--------------------------------------------------------------*/

/** Optional input file */
private final FileFormat ffin1;

/** Primary output file */
private final FileFormat ffout1;

/** Optional counts output file */
private final FileFormat ffcount;

/** Placeholder node substituted when printTaxLevel is given a null node */
private final TaxNode unknown=new TaxNode(-99, -99, TaxTree.LIFE, TaxTree.LIFE_E, "UNKNOWN");

/*--------------------------------------------------------------*/
/*----------------        Common Fields         ----------------*/
/*--------------------------------------------------------------*/

/** Print status messages to this output stream */
private PrintStream outstream=System.err;
/** Print verbose messages */
public static boolean verbose=false;
/** True if an error was encountered */
public boolean errorState=false;
/** Overwrite existing output files */
private boolean overwrite=false;
/** Append to existing output files */
private boolean append=false;
558
559 }