Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/PrintTaxonomy.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package tax; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 import java.util.Collections; | |
7 import java.util.List; | |
8 | |
9 import fileIO.FileFormat; | |
10 import fileIO.ReadWrite; | |
11 import fileIO.TextFile; | |
12 import fileIO.TextStreamWriter; | |
13 import shared.Parse; | |
14 import shared.Parser; | |
15 import shared.PreParser; | |
16 import shared.ReadStats; | |
17 import shared.Shared; | |
18 import shared.Timer; | |
19 import shared.Tools; | |
20 import stream.ConcurrentReadInputStream; | |
21 import stream.Read; | |
22 import structures.ByteBuilder; | |
23 import structures.ListNum; | |
24 | |
25 /** | |
 * Prints the taxonomy of organisms looked up by name, gi number,
 * or NCBI taxID. Queries come from the command line, a text file,
 * or the headers of a sequence file.
29 * | |
30 * @author Brian Bushnell | |
31 * @date November 23, 2015 | |
32 * | |
33 */ | |
34 public class PrintTaxonomy { | |
35 | |
36 /*--------------------------------------------------------------*/ | |
37 /*---------------- Initialization ----------------*/ | |
38 /*--------------------------------------------------------------*/ | |
39 | |
40 /** | |
41 * Code entrance from the command line. | |
42 * @param args Command line arguments | |
43 */ | |
44 public static void main(String[] args){ | |
45 Timer t=new Timer(); | |
46 PrintTaxonomy x=new PrintTaxonomy(args); | |
47 x.process(t); | |
48 | |
49 //Close the print stream if it was redirected | |
50 Shared.closeStream(x.outstream); | |
51 } | |
52 | |
53 /** | |
54 * Constructor. | |
55 * @param args Command line arguments | |
56 */ | |
57 public PrintTaxonomy(String[] args){ | |
58 | |
59 {//Preparse block for help, config files, and outstream | |
60 PreParser pp=new PreParser(args, getClass(), false); | |
61 args=pp.args; | |
62 outstream=pp.outstream; | |
63 } | |
64 | |
65 //Set shared static variables | |
66 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
67 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
68 | |
69 //Create a parser object | |
70 Parser parser=new Parser(); | |
71 | |
72 int taxLevel=0, minLevel=0, maxLevel=TaxTree.LIFE; | |
73 | |
74 //Parse each argument | |
75 for(int i=0; i<args.length; i++){ | |
76 String arg=args[i]; | |
77 | |
78 //Break arguments into their constituent parts, in the form of "a=b" | |
79 String[] split=arg.split("="); | |
80 String a=split[0].toLowerCase(); | |
81 String b=split.length>1 ? split[1] : null; | |
82 | |
83 if(a.equals("out")){ | |
84 out1=b; | |
85 }else if(a.equals("counts")){ | |
86 countFile=b; | |
87 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
88 //do nothing | |
89 }else if(a.equals("verbose")){ | |
90 verbose=Parse.parseBoolean(b); | |
91 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){ | |
92 giTableFile=b; | |
93 }else if(a.equals("accession")){ | |
94 accessionFile=b; | |
95 }else if(a.equals("tree") || a.equals("taxtree")){ | |
96 taxTreeFile=b; | |
97 }else if(a.equals("level") || a.equals("lv") || a.equals("taxlevel") || a.equals("tl")){ | |
98 taxLevel=TaxTree.parseLevel(b); | |
99 }else if(a.equals("minlevel")){ | |
100 minLevel=TaxTree.parseLevel(b); | |
101 }else if(a.equals("maxlevel")){ | |
102 maxLevel=TaxTree.parseLevel(b); | |
103 }else if(a.equals("printname")){ | |
104 printName=Parse.parseBoolean(b); | |
105 }else if(a.equals("reverse")){ | |
106 reverseOrder=Parse.parseBoolean(b); | |
107 }else if(a.equals("silva")){ | |
108 TaxTree.SILVA_MODE=Parse.parseBoolean(b); | |
109 }else if(a.equals("unite")){ | |
110 TaxTree.UNITE_MODE=Parse.parseBoolean(b); | |
111 }else if(a.equals("simple")){ | |
112 skipNonCanonical=Parse.parseBoolean(b); | |
113 }else if(a.equals("column")){ | |
114 keyColumn=Integer.parseInt(b); | |
115 }else if(b!=null && (a.equals("name") || a.equals("names") || a.equals("id") || a.equals("ids"))){ | |
116 for(String s : b.split(",")){ | |
117 names.add(s); | |
118 } | |
119 }else{ | |
120 names.add(arg); | |
121 } | |
122 } | |
123 | |
124 if(taxTreeFile==null || "auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();} | |
125 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();} | |
126 if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();} | |
127 | |
128 taxLevelExtended=TaxTree.levelToExtended(taxLevel); | |
129 minLevelExtended=TaxTree.levelToExtended(minLevel); | |
130 maxLevelExtended=TaxTree.levelToExtended(maxLevel); | |
131 | |
132 {//Process parser fields | |
133 overwrite=ReadStats.overwrite=parser.overwrite; | |
134 append=ReadStats.append=parser.append; | |
135 | |
136 in1=parser.in1; | |
137 maxReads=parser.maxReads; | |
138 } | |
139 | |
140 //Ensure output files can be written | |
141 if(!Tools.testOutputFiles(overwrite, append, false, out1)){ | |
142 outstream.println((out1==null)+", "+out1); | |
143 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n"); | |
144 } | |
145 | |
146 //Create output FileFormat objects | |
147 ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false); | |
148 | |
149 ffcount=FileFormat.testOutput(countFile, FileFormat.TEXT, null, true, overwrite, append, false); | |
150 | |
151 //Create input FileFormat objects | |
152 ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, false); | |
153 | |
154 if(giTableFile!=null){ | |
155 outstream.println("Loading gi table."); | |
156 GiToTaxid.initialize(giTableFile); | |
157 } | |
158 if(accessionFile!=null){ | |
159 outstream.println("Loading accession table."); | |
160 AccessionToTaxid.load(accessionFile); | |
161 } | |
162 if(taxTreeFile!=null){ | |
163 tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, true); | |
164 assert(tree.nameMap!=null); | |
165 }else{ | |
166 tree=null; | |
167 throw new RuntimeException("No tree specified."); | |
168 } | |
169 } | |
170 | |
171 /*--------------------------------------------------------------*/ | |
172 /*---------------- Outer Methods ----------------*/ | |
173 /*--------------------------------------------------------------*/ | |
174 | |
175 /** Create read streams and process all data */ | |
176 void process(Timer t){ | |
177 | |
178 TextStreamWriter tsw=null; | |
179 if(ffout1!=null){ | |
180 tsw=new TextStreamWriter(ffout1); | |
181 tsw.start(); | |
182 } | |
183 | |
184 if(ffin1!=null){ | |
185 if(ffin1.fasta() || ffin1.fastq() || ffin1.samOrBam() || ffin1.scarf()){ | |
186 processReads(tsw); | |
187 }else{ | |
188 processFile(new TextFile(ffin1), tsw); | |
189 } | |
190 }else{ | |
191 processNames(tsw); | |
192 } | |
193 | |
194 if(tsw!=null){errorState|=tsw.poisonAndWait();} | |
195 | |
196 if(ffcount!=null){ | |
197 TextStreamWriter tswc=new TextStreamWriter(ffcount); | |
198 tswc.start(); | |
199 for(TaxNode tn : tree.nodes){ | |
200 if(tn!=null && tn.countRaw>0){ | |
201 tswc.println(tn.countRaw+"\t"+tn.name); | |
202 } | |
203 } | |
204 errorState|=tswc.poisonAndWait(); | |
205 } | |
206 | |
207 t.stop(); | |
208 | |
209 //Throw an exception of there was an error in a thread | |
210 if(errorState){ | |
211 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
212 } | |
213 } | |
214 | |
215 /** Iterate through the names */ | |
216 void processNames(final TextStreamWriter tsw){ | |
217 for(String name : names){ | |
218 if(taxLevelExtended>0){ | |
219 printTaxLevel(name, tsw); | |
220 }else{ | |
221 printTaxonomy(name, tsw); | |
222 } | |
223 } | |
224 } | |
225 | |
226 /** Iterate through the names */ | |
227 void processFile(final TextFile tf, final TextStreamWriter tsw){ | |
228 for(String name=tf.nextLine(); name!=null; name=tf.nextLine()){ | |
229 | |
230 if(keyColumn>=0){ | |
231 String result=translateLine(name, keyColumn); | |
232 tsw.print(result); | |
233 }else if(taxLevelExtended>0){ | |
234 printTaxLevel(name, tsw); | |
235 }else{ | |
236 printTaxonomy(name, tsw); | |
237 } | |
238 } | |
239 } | |
240 | |
241 /** Iterate through the names */ | |
242 void processReads(final TextStreamWriter tsw){ | |
243 final ConcurrentReadInputStream cris; | |
244 { | |
245 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null); | |
246 if(verbose){System.err.println("Started cris");} | |
247 cris.start(); | |
248 } | |
249 | |
250 ListNum<Read> ln=cris.nextList(); | |
251 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
252 | |
253 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
254 | |
255 for(Read r1 : reads){ | |
256 if(keyColumn>=0){ | |
257 String result=translateLine(r1.id, keyColumn); | |
258 tsw.println(result); | |
259 }else if(taxLevelExtended>0){ | |
260 printTaxLevel(r1.id, tsw); | |
261 }else{ | |
262 printTaxonomy(r1.id, tsw); | |
263 } | |
264 } | |
265 cris.returnList(ln); | |
266 ln=cris.nextList(); | |
267 reads=(ln!=null ? ln.list : null); | |
268 } | |
269 cris.returnList(ln); | |
270 ReadWrite.closeStreams(cris); | |
271 } | |
272 | |
273 /*--------------------------------------------------------------*/ | |
274 /*---------------- Inner Methods ----------------*/ | |
275 /*--------------------------------------------------------------*/ | |
276 | |
277 String translateLine(String line, int col){ | |
278 StringBuilder sb=new StringBuilder(); | |
279 String[] split=line.split("\t"); | |
280 assert(split.length>col) : "Too few columns in line:\n"+line+"\n->\n"+Arrays.toString(split); | |
281 | |
282 if(col<split.length){ | |
283 String name=split[col]; | |
284 while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);} | |
285 | |
286 TaxNode tn=parseNodeFromHeader(name); | |
287 if(tn!=null){ | |
288 String tl=makeTaxLine(tn, minLevelExtended, maxLevelExtended).toString(); | |
289 split[col]=tl; | |
290 }else{ | |
291 List<TaxNode> list=tree.getNodesByNameExtended(name); | |
292 if(list!=null){ | |
293 String tab=""; | |
294 for(TaxNode tn2 : list){ | |
295 sb.append(tab); | |
296 sb.append(makeTaxLine(tn2, minLevelExtended, maxLevelExtended).toString()); | |
297 tab="\t"; | |
298 } | |
299 }else{ | |
300 split[col]=split[col]+"_***NOT_FOUND***"; | |
301 } | |
302 } | |
303 } | |
304 | |
305 for(int i=0; i<split.length; i++){ | |
306 if(i>0){sb.append('\t');} | |
307 sb.append(split[i]); | |
308 } | |
309 sb.append('\n'); | |
310 return sb.toString(); | |
311 } | |
312 | |
313 void printTaxonomy(String name, final TextStreamWriter tsw){ | |
314 while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);} | |
315 tsw.print("\n"); | |
316 if(printName){tsw.print(name+":\n");} | |
317 TaxNode tn=parseNodeFromHeader(name); | |
318 if(tn!=null){ | |
319 printTaxonomy(tn, tsw); | |
320 return; | |
321 }else{ | |
322 List<TaxNode> list=tree.getNodesByNameExtended(name); | |
323 if(list!=null){ | |
324 String nl=""; | |
325 for(TaxNode tn2 : list){ | |
326 tsw.print(nl); | |
327 printTaxonomy(tn2, tsw); | |
328 nl="\n"; | |
329 } | |
330 return; | |
331 } | |
332 } | |
333 tsw.println("Could not find node" + (printName ? "." : " for '"+name+"'")); | |
334 return; | |
335 } | |
336 | |
337 void printTaxLevel(String name, final TextStreamWriter tsw){ | |
338 while(name.startsWith(">") || name.startsWith("@")){name=name.substring(1);} | |
339 tsw.print("\n"); | |
340 if(printName){tsw.print(name+":\n");} | |
341 TaxNode tn=parseNodeFromHeader(name); | |
342 if(tn!=null){ | |
343 printTaxLevel(tn, tsw); | |
344 return; | |
345 }else{ | |
346 List<TaxNode> list=tree.getNodesByNameExtended(name); | |
347 if(list!=null){ | |
348 for(TaxNode tn2 : list){ | |
349 printTaxLevel(tn2, tsw); | |
350 } | |
351 return; | |
352 } | |
353 } | |
354 tsw.println("Could not find node" + (printName ? "." : " for '"+name+"'")); | |
355 return; | |
356 } | |
357 | |
358 // void printTaxCounts(String name, final TextStreamWriter tsw){ | |
359 // TaxNode tn=null; | |
360 // tn=tree.getNode(name); | |
361 // if(tn==null){tn=tree.getNodeByName(name);} | |
362 // if(tn==null){tn=unknown;} | |
363 // while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);} | |
364 // if(tsw!=null)tsw.println(tn.name); | |
365 // tn.incrementRaw(1); | |
366 // } | |
367 | |
368 void printTaxonomy(TaxNode tn, final TextStreamWriter tsw){ | |
369 // assert(false) : tn.levelExtended+", "+taxLevelExtended+", "+minLevelExtended+", "+maxLevelExtended; | |
370 assert(tn!=null); | |
371 // tsw.print("\n"); | |
372 do{ | |
373 if(tn.levelExtended<=taxLevelExtended){tn.incrementRaw(1);} | |
374 if(tn.levelExtended>=minLevelExtended && tn.levelExtended<=maxLevelExtended){ | |
375 if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){ | |
376 tsw.println(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name); | |
377 } | |
378 } | |
379 tn=tree.getNode(tn.pid); | |
380 }while(tn!=null && tn.id!=tn.pid); | |
381 } | |
382 | |
383 StringBuilder makeTaxLine(TaxNode tn, int minLevelE, int maxLevelE){ | |
384 // assert(false) : tn+", "+minLevelE+", "+maxLevelE; | |
385 assert(tn!=null); | |
386 StringBuilder sb=new StringBuilder(); | |
387 | |
388 if(reverseOrder){ | |
389 ArrayList<TaxNode> list=new ArrayList<TaxNode>(); | |
390 while(tn.levelExtended<=maxLevelE){ | |
391 if(tn.levelExtended>=minLevelE){ | |
392 if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){ | |
393 list.add(tn); | |
394 } | |
395 } | |
396 if(tn.id==tn.pid){break;} | |
397 tn=tree.getNode(tn.pid); | |
398 } | |
399 | |
400 String semi=""; | |
401 Collections.reverse(list); | |
402 for(TaxNode tn2 : list){ | |
403 sb.append(semi); | |
404 sb.append(tn2.levelToStringShort()); | |
405 sb.append("__"); | |
406 sb.append(tn2.name); | |
407 semi=";"; | |
408 } | |
409 }else{ | |
410 String semi=""; | |
411 while(tn.levelExtended<=maxLevelE){ | |
412 if(tn.levelExtended>=minLevelE && !tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){ | |
413 sb.append(semi); | |
414 sb.append(tn.levelToStringShort()); | |
415 sb.append("__"); | |
416 sb.append(tn.name); | |
417 semi=";"; | |
418 } | |
419 if(tn.id==tn.pid){break;} | |
420 tn=tree.getNode(tn.pid); | |
421 } | |
422 } | |
423 | |
424 return sb; | |
425 } | |
426 | |
427 // public static void printTaxonomy(TaxNode tn, final StringBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){ | |
428 // final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel); | |
429 // assert(tn!=null); | |
430 //// tsw.print("\n"); | |
431 // do{ | |
432 // if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){ | |
433 // sb.append(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name+"\n"); | |
434 // } | |
435 // tn=tree.getNode(tn.pid); | |
436 // }while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE); | |
437 // } | |
438 | |
439 public static void printTaxonomy(TaxNode tn, final ByteBuilder sb, final TaxTree tree, final int maxLevel, boolean skipNonCanonical){ | |
440 final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel); | |
441 assert(tn!=null); | |
442 // tsw.print("\n"); | |
443 do{ | |
444 if(!tn.cellularOrganisms() && (!skipNonCanonical || tn.isSimple())){ | |
445 sb.append(tn.levelStringExtended(false)).append('\t').append(tn.id).append('\t').append(tn.name).append('\n'); | |
446 } | |
447 tn=tree.getNode(tn.pid); | |
448 }while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE); | |
449 } | |
450 | |
451 // public static void printTaxonomy(TaxNode tn, final TextStreamWriter tsw, final TaxTree tree, final int maxLevel){ | |
452 // final int maxLevelE=maxLevel<0 ? maxLevel : TaxTree.levelToExtended(maxLevel); | |
453 // assert(tn!=null); | |
454 //// tsw.print("\n"); | |
455 // do{ | |
456 // if(!skipNonCanonical || tn.isSimple()){ | |
457 // tsw.println(tn.levelStringExtended(false)+"\t"+tn.id+"\t"+tn.name); | |
458 // } | |
459 // tn=tree.getNode(tn.pid); | |
460 // }while(tn!=null && tn.id!=tn.pid && tn.levelExtended<=maxLevelE); | |
461 // } | |
462 | |
463 void printTaxLevel(TaxNode tn, final TextStreamWriter tsw){ | |
464 if(tn==null){tn=unknown;} | |
465 while(tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){tn=tree.getNode(tn.pid);} | |
466 if(tsw!=null){tsw.println(tn.name);} | |
467 tn.incrementRaw(1); | |
468 } | |
469 | |
470 // void printTaxCounts(TaxNode tn, final TextStreamWriter tsw){ | |
471 // if(tn==null){tn=unknown;} | |
472 // while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);} | |
473 // if(tsw!=null)tsw.println(tn.name); | |
474 // tn.incrementRaw(1); | |
475 // } | |
476 | |
477 public TaxNode parseNodeFromHeader(String header){ | |
478 if(tree==null){return null;} | |
479 return tree.parseNodeFromHeader(header, true); | |
480 } | |
481 | |
482 /*--------------------------------------------------------------*/ | |
483 /*---------------- Fields ----------------*/ | |
484 /*--------------------------------------------------------------*/ | |
485 | |
/** Optional input file path; copied from the standard parser's in1 */
private String in1=null;

/** Primary output file path; set by "out=" */
private String out1="stdout.txt";

/** Optional output path for per-node counts; set by "counts=" */
private String countFile=null;

/** Path to the gi table; set by "table=", "gi=", or "gitable="; "auto" selects the default */
private String giTableFile=null;
/** Path to the taxonomy tree; set by "tree=" or "taxtree="; null or "auto" selects the default */
private String taxTreeFile=null;
/** Path to the accession table; set by "accession="; "auto" selects the default */
private String accessionFile=null;

/** Taxonomy tree loaded by the constructor */
private final TaxTree tree;

//	/** Level to print */
//	private int taxLevel=-1;//TaxTree.stringToLevel("phylum");
//
//	/** Min level to print */
//	private int minLevel=-1;
//
//	/** Max level to print */
//	private int maxLevel=TaxTree.stringToLevel("life");

/**
 * The "level", "minlevel", and "maxlevel" settings after conversion by TaxTree.levelToExtended.
 * A positive taxLevelExtended switches output from full lineages to a single ancestor per query.
 */
private final int taxLevelExtended, minLevelExtended, maxLevelExtended;

/** Reverse order for tax lines */
private boolean reverseOrder=true;

/** Queries collected from the command line; used when there is no input file */
private ArrayList<String> names=new ArrayList<String>();

/** Copied from the standard parser's maxReads and passed to the read input stream */
private long maxReads=-1;

/** Print each query followed by a colon before its results; set by "printname=" */
boolean printName=true;
/** Restrict output to nodes for which isSimple() holds; set by "simple=" */
boolean skipNonCanonical=false;

/** Column to translate in tab-delimited input; negative disables translation; set by "column=" */
int keyColumn=-1;
//	Deprecated.  Description from shellscript:
//	column=-1       If set to a non-negative integer, parse the taxonomy
//	                information from this column in a tab-delimited file.
//	                Example if column=1:
//	                read1 TAB gi|944259871|gb|KQL24128.1| TAB score:42
//	                becomes
//	                read1 TAB k__Viridiplantae;p__Streptophyta;... TAB score:42
529 | |
530 /*--------------------------------------------------------------*/ | |
531 /*---------------- Final Fields ----------------*/ | |
532 /*--------------------------------------------------------------*/ | |
533 | |
/** Optional input file; when absent, queries come from the command line */
private final FileFormat ffin1;

/** Primary output file */
private final FileFormat ffout1;

/** Optional counts output file, built from countFile */
private final FileFormat ffcount;

/** Placeholder node substituted by printTaxLevel when it is handed a null node; it is its own parent */
private final TaxNode unknown=new TaxNode(-99, -99, TaxTree.LIFE, TaxTree.LIFE_E, "UNKNOWN");
543 | |
544 /*--------------------------------------------------------------*/ | |
545 /*---------------- Common Fields ----------------*/ | |
546 /*--------------------------------------------------------------*/ | |
547 | |
/** Print status messages to this output stream; replaced by the preparser's stream in the constructor */
private PrintStream outstream=System.err;
/** Print verbose messages; static, so shared by all instances */
public static boolean verbose=false;
/** True if an error was encountered; process() throws at the end when set */
public boolean errorState=false;
/** Overwrite existing output files; copied from the standard parser */
private boolean overwrite=false;
/** Append to existing output files; copied from the standard parser */
private boolean append=false;
558 | |
559 } |