comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/TaxTree.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package tax;
2
3 import java.io.PrintStream;
4 import java.io.Serializable;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.LinkedHashMap;
9 import java.util.List;
10 import java.util.regex.Pattern;
11
12 import fileIO.ByteFile;
13 import fileIO.ReadWrite;
14 import fileIO.TextFile;
15 import shared.Parse;
16 import shared.Parser;
17 import shared.PreParser;
18 import shared.Shared;
19 import shared.Timer;
20 import shared.Tools;
21 import structures.ByteBuilder;
22 import structures.IntHashMap;
23 import structures.IntList;
24 import structures.IntLongHashMap;
25
26 /**
27 * Represents a taxonomic tree.
28 * Usually just one of these needs to be created for a process.
29 * Designed for NCBI's taxdmp.zip file contents.
30 * @author Brian Bushnell
31 * @date Mar 6, 2015
32 *
33 */
34 public class TaxTree implements Serializable{
35
36 /**
37 *
38 */
39 private static final long serialVersionUID = 5894416521711540017L;
40
41 /*--------------------------------------------------------------*/
42 /*---------------- Main ----------------*/
43 /*--------------------------------------------------------------*/
44
45 /**
46 * Code entrance from the command line.
47 * This is not called normally, only when converting NCBI text files
48 * into a binary representation and writing it to disk.
49 * @param args Command line arguments
50 */
51 public static void main(String[] args){
52
53 {//Preparse block for help, config files, and outstream
54 PreParser pp=new PreParser(args, outstream, null, false);
55 args=pp.args;
56 outstream=pp.outstream;
57 }
58
59 assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n";
60 ReadWrite.USE_UNPIGZ=true;
61 ReadWrite.USE_PIGZ=true;
62 ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9);
63 ReadWrite.PIGZ_BLOCKSIZE=256;
64 ReadWrite.PIGZ_ITERATIONS=60;
65
66 Timer t=new Timer();
67 TaxTree tree=new TaxTree(args);
68 t.stop();
69
70 outstream.println("Retained "+tree.nodeCount+" nodes:");
71
72 for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){
73 outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]);
74 if(verbose){
75 int lim=10;
76 for(int j=0; j<lim && j<tree.treeLevelsExtended[i].length; j++){
77 TaxNode n=tree.treeLevelsExtended[i][j];
78 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]);
79 }
80 for(int j=tree.treeLevelsExtended[i].length-lim; j<tree.treeLevelsExtended[i].length; j++){
81 if(j>=lim){
82 TaxNode n=tree.treeLevelsExtended[i][j];
83 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]);
84 }
85 }
86 }
87 outstream.println();
88 }
89
90
91 outstream.println();
92 outstream.println("Time: \t"+t);
93
94 if(args.length>2){//Write a tree
95 ReadWrite.write(tree, args[3], true);
96 }
97 }
98
99 /** Parse arguments from the command line */
100 private Parser parse(String[] args){
101
102 //Create a parser object
103 Parser parser=new Parser();
104
105 //Set any necessary Parser defaults here
106 //parser.foo=bar;
107
108 //Parse each argument
109 for(int i=0; i<args.length; i++){
110 String arg=args[i];
111
112 //Break arguments into their constituent parts, in the form of "a=b"
113 String[] split=arg.split("=");
114 String a=split[0].toLowerCase();
115 String b=split.length>1 ? split[1] : null;
116 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
117
118 if(a.equals("verbose")){
119 verbose=Parse.parseBoolean(b);
120 }else if(a.equals("parse_flag_goes_here")){
121 long fake_variable=Parse.parseKMG(b);
122 //Set a variable here
123 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
124 //do nothing
125 }else if(i>3){
126 outstream.println("Unknown parameter "+args[i]);
127 assert(false) : "Unknown parameter "+args[i];
128 }
129 }
130
131 return parser;
132 }
133
134 /*--------------------------------------------------------------*/
135 /*---------------- Initialization ----------------*/
136 /*--------------------------------------------------------------*/
137
138 /*--------------------------------------------------------------*/
139 /*---------------- Constructors ----------------*/
140 /*--------------------------------------------------------------*/
141
142 /**
143 * Constructor using filenames from command line arguments, in the format of:
144 * {names, nodes, merged}
145 * @param args Command line arguments
146 */
147 private TaxTree(String[] args){
148 this(args[0], args[1], args[2], args);
149 }
150
151 /**
152 * @param namesFile NCBI names.txt
153 * @param nodesFile NCBI nodes.txt
154 * @param mergedFile NCBI merged.txt
155 * @param args
156 */
157 private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){
158
159 if(args!=null) {
160 Parser parser=parse(args);
161 }
162
163 nodes=getNames(namesFile);
164 getNodes(nodesFile, nodes);
165
166 mergedMap=getMerged(mergedFile);
167
168 countChildren();
169 outstream.println("Counted children.");
170 int rounds=percolate();
171 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
172
173 if(assignStrains){
174 assignStrains();
175 rounds=percolate();
176 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
177 }
178
179 if(simplify){
180 if(verbose){outstream.println("Simplifying.");}
181 int removed=simplify(nodes);
182 if(verbose){outstream.println("Removed "+removed+" nodes.");}
183 rounds=percolate();
184 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
185 }
186 int errors=test(nodes);
187 // assert(errors==0); //Not possible since the tree is wrong.
188 if(errors>0) {
189 System.err.println("Found "+errors+" errors in tree.");
190 }
191
192 for(TaxNode n : nodes){
193 if(n!=null){
194 nodesPerLevel[n.level]++;
195 nodesPerLevelExtended[n.levelExtended]++;
196 }
197 }
198
199 // for(int i=0; i<nodesPerLevel.length; i++){
200 // treeLevels[i]=new TaxNode[nodesPerLevel[i]];
201 // }
202 for(int i=0; i<nodesPerLevelExtended.length; i++){
203 treeLevelsExtended[i]=new TaxNode[nodesPerLevelExtended[i]];
204 }
205
206 // {
207 // int[] temp=new int[nodesPerLevel.length];
208 // for(TaxNode n : nodes){
209 // if(n!=null){
210 // int level=n.level;
211 // treeLevels[level][temp[level]]=n;
212 // temp[level]++;
213 // }
214 // }
215 // }
216
217 {
218 int[] temp=new int[nodesPerLevelExtended.length];
219 for(TaxNode n : nodes){
220 if(n!=null){
221 int level=n.levelExtended;
222 treeLevelsExtended[level][temp[level]]=n;
223 temp[level]++;
224 }
225 }
226 }
227 nodeCount=(int)Tools.sum(nodesPerLevelExtended);
228
229 }
230
231 /*--------------------------------------------------------------*/
232 /*--------------- Loaders ----------------*/
233 /*--------------------------------------------------------------*/
234
235
236 /**
237 * Load a tax tree from disk.
238 * @param taxTreeFile Serialized TaxTree.
239 * @param hashNames Hash nodes using names as keys
240 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens
241 * @return
242 */
243 public static final TaxTree loadTaxTree(String taxTreeFile, PrintStream outstream, boolean hashNames,
244 boolean hashDotFormat){
245 if(taxTreeFile==null){return null;}
246 return loadTaxTree(taxTreeFile, null, null, null, outstream, hashNames, hashDotFormat);
247 }
248
249 /**
250 * Load a tax tree from disk, either from a binary tree file,
251 * or from NCBI text files.
252 * @param taxTreeFile Binary representation; mutually exclusive with other files.
253 * @param taxNameFile NCBI names.txt
254 * @param taxNodeFile NCBI nodes.txt
255 * @param taxMergedFile NCBI merged.txt
256 * @param hashNames Hash nodes using names as keys
257 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens
258 * @return The loaded tree
259 */
260 public static final TaxTree loadTaxTree(String taxTreeFile, String taxNameFile, String taxNodeFile,
261 String taxMergedFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){
262 if(taxTreeFile!=null || taxNodeFile==null){
263 TaxTree tree=sharedTree(taxTreeFile, hashNames, hashDotFormat, outstream);
264 if(tree!=null){return tree;}
265 }
266 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=defaultTreeFile();}
267 assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files.";
268 Timer t=new Timer();
269 if(outstream!=null){outstream.print("\nLoading tax tree; ");}
270 final TaxTree tree;
271 if(taxTreeFile!=null){
272 tree=ReadWrite.read(TaxTree.class, taxTreeFile, true);
273 }else{
274 tree=new TaxTree(taxNameFile, taxNodeFile, taxMergedFile, null);
275 }
276 t.stop();
277 if(hashNames){
278 if(outstream!=null){outstream.println("Hashing names.");}
279 tree.hashNames(hashDotFormat);
280 }
281 if(outstream!=null){
282 outstream.println("Time: \t"+t);
283 Shared.printMemory();
284 outstream.println();
285 }
286 if(ALLOW_SHARED_TREE){sharedTree=tree;}
287 return tree;
288 }
289
290 /*--------------------------------------------------------------*/
291 /*--------------- Constructor Helpers ----------------*/
292 /*--------------------------------------------------------------*/
293
294 /**
295 * Finds unranked nodes in the archaeal and bacterial kingdoms.
296 * If these are below species level, have a ranked parent,
297 * and have no ranked children, they are assigned strain or substrain.
298 */
299 private void assignStrains(){
300
301 outstream.println("Assigning strains.");
302 int strains=0, substrains=0;
303 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed yet
304 TaxNode archaea=getNode(ARCHAEA_ID);
305 assert(bacteria.name.equalsIgnoreCase("Bacteria"));
306 assert(archaea.name.equalsIgnoreCase("Archaea"));
307
308 ArrayList<TaxNode> bactList=new ArrayList<TaxNode>();
309 ArrayList<TaxNode> archList=new ArrayList<TaxNode>();
310 for(TaxNode tn : nodes){
311 if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){
312 if(descendsFrom(tn, bacteria)){
313 bactList.add(tn);
314 }else if(descendsFrom(tn, archaea)){
315 archList.add(tn);
316 }
317 }
318 }
319
320 ArrayList<TaxNode> prokList=new ArrayList<TaxNode>(bactList.size()+archList.size());
321 prokList.addAll(bactList);
322 prokList.addAll(archList);
323
324 for(TaxNode tn : prokList){
325 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){
326 TaxNode parent=nodes[tn.pid];
327 if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){
328 tn.levelExtended=STRAIN_E;
329 tn.level=SUBSPECIES;
330 tn.setOriginalLevel(STRAIN_E);
331 strains++;
332 }
333 }
334 }
335
336 // outstream.println("Assigned "+strains+" strains.");
337 for(TaxNode tn : prokList){
338 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){
339 TaxNode parent=nodes[tn.pid];
340 if(parent.levelExtended==STRAIN_E){
341 tn.levelExtended=SUBSTRAIN_E;
342 tn.level=SUBSPECIES;
343 tn.setOriginalLevel(SUBSTRAIN_E);
344 substrains++;
345 }
346 }
347 }
348 // outstream.println("Assigned "+substrains+" substrains.");
349 }
350
351 @Deprecated
352 private void assignStrainsOld(){
353
354 outstream.println("Assigning strains.");
355 int strains=0, substrains=0;
356 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed
357 assert(bacteria.name.equalsIgnoreCase("Bacteria"));
358 for(TaxNode tn : nodes){
359 if(tn!=null && tn.originalLevel()==NO_RANK){
360 TaxNode parent=nodes[tn.pid];
361 if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){
362 // nodesPerLevelExtended[STRAIN_E]++;
363 // nodesPerLevelExtended[tn.levelExtended]--;
364 tn.levelExtended=STRAIN_E;
365 tn.level=SUBSPECIES;
366 tn.setOriginalLevel(STRAIN_E);
367 strains++;
368 }
369 }
370 }
371 // outstream.println("Assigned "+strains+" strains.");
372 for(TaxNode tn : nodes){
373 if(tn!=null && tn.originalLevel()==NO_RANK){
374 TaxNode parent=nodes[tn.pid];
375 if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){
376 // nodesPerLevelExtended[SUBSTRAIN_E]++;
377 // nodesPerLevelExtended[tn.levelExtended]--;
378 tn.levelExtended=SUBSTRAIN_E;
379 tn.level=SUBSPECIES;
380 tn.setOriginalLevel(SUBSTRAIN_E);
381 substrains++;
382 }
383 }
384 }
385 // outstream.println("Assigned "+substrains+" substrains.");
386 }
387
388 /*--------------------------------------------------------------*/
389 /*---------------- Construction ----------------*/
390 /*--------------------------------------------------------------*/
391
392 /**
393 * Create tax nodes using names in the designated file.
394 * @param fname NCBI names.txt
395 * @return Array of created nodes, where array[x] contains the node with TaxID x.
396 */
397 private static TaxNode[] getNames(String fname){
398 ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000);
399 int max=0;
400
401 TextFile tf=new TextFile(fname, false);
402 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
403 if(s.contains("scientific name")){
404 String[] split=delimiter.split(s, 3);
405 assert(split.length==3) : s;
406 int id=Integer.parseInt(split[0]);
407 String name=split[1];
408 if(id==1 && name.equalsIgnoreCase("root")){name="Life";}
409 max=Tools.max(max, id);
410 list.add(new TaxNode(id, name));
411 }
412 }
413
414 TaxNode[] nodes=new TaxNode[max+1];
415 for(TaxNode n : list){
416 assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n;
417 nodes[n.id]=n;
418 }
419
420 return nodes;
421 }
422
423 /**
424 * Parses names file a second time to fill in additional information.
425 * Should really be merged into getNames.
426 * @TODO Merge into getNames
427 */
428 private static TaxNode[] getNodes(String fname, TaxNode[] nodes){
429
430 int max=0;
431
432 LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>();
433
434 TextFile tf=new TextFile(fname, false);
435 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
436 String[] split=delimiter.split(s, 4);
437 assert(split.length==4) : s;
438 int id=-1, pid=-1, level=-1, levelExtended=-1;
439
440 id=Integer.parseInt(split[0]);
441 try {
442 pid=Integer.parseInt(split[1]);
443 } catch (NumberFormatException e) {
444 // TODO Auto-generated catch block
445 e.printStackTrace();
446 System.err.println("Bad line: "+s+"\n"+Arrays.toString(split));
447 }
448 boolean alt=false;
449 {
450 String key=split[2];
451 Integer obj0=levelMap.get(key);
452 Integer obj=levelMapExtended.get(key);
453 assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split);
454
455 if(obj0==null){
456 obj0=altLevelMap.get(key);
457 alt=true;
458 }
459 if(obj0!=null){
460 level=obj0;
461 levelExtended=obj;
462 if(id==pid){
463 level=LIFE;
464 levelExtended=LIFE_E;
465 alt=false;
466 }
467 }else{
468 if(id==pid){
469 level=LIFE;
470 levelExtended=LIFE_E;
471 alt=false;
472 }else{
473 int[] count=oddNames.get(key);
474 if(count==null){
475 count=new int[1];
476 oddNames.put(key, count);
477 }
478 count[0]++;
479 }
480 }
481 }
482 max=Tools.max(max, id);
483 TaxNode n=nodes[id];
484 assert(n!=null && n.pid<0) : n+" -> "+s;
485 n.pid=pid;
486 n.level=level;
487 n.levelExtended=levelExtended;
488 n.setOriginalLevel(levelExtended);
489 n.setCanonical(!alt);
490 assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n";
491 }
492
493 if(oddNames.size()>0){
494 outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:");
495 if(verbose){
496 for(String s : oddNames.keySet()){
497 outstream.println(oddNames.get(s)[0]+"\t"+s);
498 }
499 }
500 }
501
502 return nodes;
503 }
504
505 /**
506 * Count child nodes of each node.
507 * This can be used to size arrays or determine which nodes are leaves.
508 */
509 private void countChildren(){
510 for(TaxNode child : nodes){
511 if(child!=null && child.pid!=child.id){
512 TaxNode parent=getNode(child.pid);
513 if(parent!=child){parent.numChildren++;}
514 }
515 }
516 }
517
518 //TODO - This could be finished in 2 passes using the childTable.
519 /**
520 * Fill derived fields minParentLevelExtended and maxChildLevelExtended
521 * by percolating information through the tree until a fixpoint is reached.
522 * @TODO This could be finished in 2 passes using the childTable.
523 * @return Number of rounds required to reach fixpoint.
524 */
525 private int percolate(){
526 boolean changed=true;
527 int rounds=0;
528 while(changed){
529 changed=false;
530 rounds++;
531 for(TaxNode child : nodes){
532 if(child!=null && child.pid!=child.id){
533 TaxNode parent=getNode(child.pid);
534 changed=(child.discussWithParent(parent) | changed);
535 }
536 }
537
538 if(!changed){break;}
539 changed=false;
540 rounds++;
541 for(int i=nodes.length-1; i>=0; i--){
542 TaxNode child=nodes[i];
543 if(child!=null && child.pid!=child.id){
544 TaxNode parent=getNode(child.pid);
545 changed=(child.discussWithParent(parent) | changed);
546 }
547 }
548 }
549 return rounds;
550 }
551
552 /**
553 * Load nodes into the nameMap and nameMapLower, mapped to their names.
554 * @param genusDotSpecies Also hash abbreviations such as E.coli.
555 */
556 public synchronized void hashNames(boolean genusDotSpecies){
557 if(nameMap!=null){return;}
558 assert(nameMap==null);
559 assert(nameMapLower==null);
560 final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN));
561 nameMap=new HashMap<String, ArrayList<TaxNode>>(size);
562 nameMapLower=new HashMap<String, ArrayList<TaxNode>>(size);
563
564 //Hash the names, both lowercase and uppercase
565 for(TaxNode n : nodes){
566 if(n!=null){
567 String name=n.name;
568 if(name.indexOf('_')>=0){
569 name=name.replace('_', ' ').trim();
570 }
571 if(name!=null && !name.equals("environmental samples")){
572 {
573 ArrayList<TaxNode> list=nameMap.get(name);
574 if(list==null){
575 list=new ArrayList<TaxNode>();
576 nameMap.put(name, list);
577 }
578 list.add(n);
579 }
580 {
581 String lc=name.toLowerCase();
582 ArrayList<TaxNode> list=nameMapLower.get(lc);
583 if(list==null){
584 list=new ArrayList<TaxNode>();
585 nameMapLower.put(lc, list);
586 }
587 list.add(n);
588 }
589 }
590 }
591 }
592
593 //Hash G.species versions of the names, both lowercase and uppercase
594 if(genusDotSpecies){
595 ByteBuilder bb=new ByteBuilder(64);
596 for(TaxNode n : nodes){
597 if(n!=null && n.levelExtended==SPECIES_E){
598 String name=n.name;
599 if(name.indexOf('_')>=0){
600 name=name.replace('_', ' ').trim();
601 }
602 if(name!=null && !name.equals("environmental samples")){
603 final String dotFormat=dotFormat(name, bb);
604 if(dotFormat!=null){
605 {
606 ArrayList<TaxNode> list=nameMap.get(dotFormat);
607 if(list==null){
608 list=new ArrayList<TaxNode>();
609 nameMap.put(dotFormat, list);
610 }
611 list.add(n);
612 }
613 {
614 String lc=dotFormat.toLowerCase();
615 ArrayList<TaxNode> list=nameMapLower.get(lc);
616 if(list==null){
617 list=new ArrayList<TaxNode>();
618 nameMapLower.put(lc, list);
619 }
620 list.add(n);
621 }
622 }
623 }
624 }
625 }
626 }
627 }
628
629 /**
630 * Generate the "dot format" name of a node.
631 * For example, transform "Homo sapiens" to "H.sapiens"
632 * @param name Node name
633 * @param buffer A ByteBuilder that may be modified
634 * @return Dot format
635 */
636 private static String dotFormat(String name, ByteBuilder buffer){
637 if(name==null || name.indexOf('.')>=0){return null;}
638 final int firstSpace=name.indexOf(' ');
639 if(firstSpace<0 || firstSpace>=name.length()-1){return null;}
640 final int lastSpace=name.lastIndexOf(' ');
641 if(firstSpace!=lastSpace){return null;}
642 final String a=name.substring(0, firstSpace);
643 final String b=name.substring(lastSpace+1);
644 final char ca=a.charAt(0);
645 final char cb=b.charAt(0);
646 if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;}
647 if(buffer==null){buffer=new ByteBuilder(2+b.length());}
648 else{buffer.clear();}
649 buffer.append(ca).append('.').append(b);
650 return buffer.toString();
651 }
652
653 /**
654 * Fill childMap, which maps nodes to their children.
655 */
656 public synchronized void hashChildren(){
657 assert(childMap==null);
658 int nodesWithChildren=0;
659 for(TaxNode tn : nodes){
660 if(tn!=null && tn.numChildren>0){nodesWithChildren++;}
661 }
662 childMap=new HashMap<TaxNode, ArrayList<TaxNode>>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN));
663 for(TaxNode tn : nodes){
664 if(tn!=null){
665 if(tn.numChildren>0){
666 childMap.put(tn, new ArrayList<TaxNode>(tn.numChildren));
667 }
668 }
669 }
670 for(TaxNode tn : nodes){
671 if(tn!=null){
672 if(tn.id!=tn.pid){
673 ArrayList<TaxNode> list=childMap.get(getNode(tn.pid));
674 if(list!=null){list.add(tn);}
675 }
676 }
677 }
678 }
679
680 /**
681 * Fetch this node's children.
682 * @param parent Node in question
683 * @return List of child nodes
684 */
685 public ArrayList<TaxNode> getChildren(TaxNode parent){
686 if(parent.numChildren<1){return null;}
687 if(childMap!=null){return childMap.get(parent);}
688 ArrayList<TaxNode> list=new ArrayList<TaxNode>(parent.numChildren);
689 for(TaxNode tn : nodes){
690 if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){
691 list.add(tn);
692 }
693 }
694 return list;
695 }
696
697 /**
698 * Load a map of old to new TaxIDs.
699 * @param mergedFile NCBI merged.txt.
700 * @return Map of old to new TaxIDs
701 */
702 private static IntHashMap getMerged(String mergedFile) {
703 if(mergedFile==null){return null;}
704 String[] lines=TextFile.toStringLines(mergedFile);
705 if(lines.length<1){return null;}
706 IntHashMap map=new IntHashMap((int)(lines.length*1.3));
707 for(String line : lines){
708 String[] split=delimiterTab.split(line);
709 int a=Integer.parseInt(split[0]);
710 int b=Integer.parseInt(split[2]);
711 map.put(a, b);
712 }
713 return map;
714 }
715
716 /**
717 * Simplify the tree by assigning ranks to unranked nodes,
718 * where possible, through inference.
719 * Optionally removes unranked nodes based on the skipNorank field.
720 * @param nodes Array of all TaxNodes.
721 * @return Number of nodes removed.
722 */
723 private int simplify(TaxNode nodes[]){
724
725 int failed=test(nodes);
726
727 int removed=0;
728 int reassigned=0;
729
730 if(reassign){
731 boolean changed=true;
732 int changedCount=0;
733 while(changed){
734 changed=false;
735 for(int i=0; i<nodes.length; i++){
736 TaxNode n=nodes[i];
737 if(n!=null && n.levelExtended<1){
738 int pid=n.pid;
739 TaxNode parent=nodes[pid];
740 assert(parent!=null) : n;
741 if(n.maxDescendantLevelIncludingSelf()<SUBSPECIES_E &&
742 parent.minAncestorLevelIncludingSelf()<=SPECIES_E && parent.minAncestorLevelIncludingSelf()>=SUBSPECIES_E){
743 // if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){
744 changed=true;
745 n.levelExtended=SUBSPECIES_E;
746 n.level=SUBSPECIES;
747 changedCount++;
748 }
749 }
750 }
751 }
752 System.err.println("Assigned levels to "+changedCount+" unranked nodes.");
753 }
754
755
756 if(skipNorank){//Skip nodes with unknown taxa
757 if(verbose){outstream.println("A0");}
758
759 for(int i=0; i<nodes.length; i++){
760 TaxNode n=nodes[i];
761 if(n!=null){
762 int pid=n.pid;
763 TaxNode parent=nodes[pid];
764 assert(parent!=null) : n;
765 assert(parent!=n || pid==1) : n+", "+pid;
766 while(parent.levelExtended<1 && n.levelExtended>parent.levelExtended){
767 //System.err.println("Reassigned from "+parent);
768 assert(parent.id!=parent.pid);
769 parent=nodes[parent.pid];
770 n.pid=parent.id;
771 reassigned++;
772 }
773 }
774 }
775
776 for(int i=0; i<nodes.length; i++){
777 if(nodes[i]!=null && nodes[i].levelExtended<0){
778 System.err.println("Removed "+nodes[i]);
779 nodes[i]=null;
780 removed++;
781 }
782 }
783 if(verbose){outstream.println("Skipped "+reassigned+" unranked parents, removed "+removed+" invalid nodes.");}
784 }
785
786 if(inferRankLimit>0){//Infer level for unset nodes (from "no rank")
787 if(verbose){outstream.println("A");}
788 int changed=1;
789 while(changed>0){
790 changed=0;
791 for(final TaxNode n : nodes){
792 if(n!=null){
793 if(n.levelExtended==0){
794 TaxNode parent=nodes[n.pid];
795 if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){
796 n.levelExtended=Tools.max(1, parent.levelExtended-1);
797 assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit);
798 changed++;
799 }
800 }
801 }
802 }
803 if(verbose){outstream.println("changed: "+changed);}
804 }
805
806 // outstream.println("B");
807 // for(TaxNode n : nodes){
808 // if(n!=null && n.level==0){
809 // n.level=-1;
810 // }
811 // }
812 }
813
814 failed=test(nodes);
815
816 // if(reassign){//Skip nodes with duplicate taxa
817 // if(verbose){outstream.println("D");}
818 // int changed=1;
819 // while(changed>0){
820 // changed=0;
821 // for(final TaxNode n : nodes){
822 // if(n!=null){
823 // TaxNode parent=nodes[n.pid];
824 // TaxNode grandparent=nodes[parent.pid];
825 // assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent;
826 // assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent;
827 //
828 // while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) ||
829 // n.level>parent.level || (n.level==parent.level))){
830 // parent=grandparent;
831 // grandparent=nodes[parent.pid];
832 // n.pid=parent.id;
833 // reassigned++;
834 // changed++;
835 // }
836 // }
837 // }
838 // if(verbose){outstream.println("changed: "+changed);}
839 // }
840 // if(verbose){outstream.println("E");}
841 // for(int i=0; i<nodes.length; i++){
842 // if(nodes[i]!=null && nodes[i].level<0){
843 // nodes[i]=null;
844 // removed++;
845 // }
846 // }
847 // }
848
849 failed=test(nodes);
850
851 if(verbose){outstream.println("F");}
852 {//Ensure the tree is now clean
853 for(int i=0; i<nodes.length; i++){
854 TaxNode n=nodes[i];
855 if(n!=null){
856 TaxNode parent=nodes[n.pid];
857 TaxNode grandparent=nodes[parent.pid];
858 assert(n==parent || n.levelExtended<=parent.levelExtended || !n.canonical() || n.levelExtended<1 || parent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent;
859 assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent;
860 }
861 }
862 }
863
864 // if(verbose){System.err.println("Reassignments: "+reassigned);}
865
866 return removed;
867 }
868
869 /*--------------------------------------------------------------*/
870 /*---------------- Validation ----------------*/
871 /*--------------------------------------------------------------*/
872
873 /**
874 * Ensure tree has monotonically increasing (or nondescending) ranks.
875 * @param nodes All TaxNodes.
876 * @return Number of violations.
877 */
878 private static int test(TaxNode[] nodes){
879 int failed=0;
880 for(final TaxNode n : nodes){
881 if(n!=null){
882 TaxNode parent=nodes[n.pid];
883 try {
884 assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) :
885 "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n"
886 + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended;
887 assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent;
888 // assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical() || !parent.canonical()) : n+" -> "+parent;
889 if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){
890 if(verbose){outstream.println("Error: "+n+" -> "+parent);}
891 failed++;
892 }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){
893 // if(verbose){outstream.println("Error: "+n+" -> "+parent);}
894 // failed++;
895 }
896 assert(n!=parent || n.id<=1) : n;
897 } catch (Throwable e) {
898 // TODO Auto-generated catch block
899 e.printStackTrace();
900 failed++;
901 }
902 }
903 }
904 if(verbose || failed>0){outstream.println(failed+" nodes failed.");}
905 return failed;
906 }
907
908 /*--------------------------------------------------------------*/
909 /*---------------- Print Methods ----------------*/
910 /*--------------------------------------------------------------*/
911
912 /**
913 * Format full name in semicolon format, e.g.
914 * "SK:Bacteria;P:Protobacteria;..."
915 * @param tn0 Base node
916 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
917 * @return Resultant String
918 */
919 public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){
920 StringBuilder sb=new StringBuilder();
921 if(tn0==null){return "Not found";}
922 String semi="";
923 ArrayList<TaxNode> list=toAncestors(tn0, skipNonCanonical);
924 boolean addTaxLevel=true;
925 for(int i=list.size()-1; i>=0; i--){
926 sb.append(semi);
927 TaxNode tn=list.get(i);
928 if(tn.id!=LIFE_ID || list.size()==1){
929 if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){
930 sb.append(tn.levelToStringShort()).append(':');
931 }
932 sb.append(mononomial ? mononomial(tn) : tn.name);
933 semi=";";
934 }
935 }
936 return sb.toString();
937 }
938
939 /**
940 * Return a list of TaxIDs of all ancestors.
941 * @param tn0 Base node
942 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
943 * @return List of TaxIDs.
944 */
945 public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){
946 if(tn0==null){return null;}
947 IntList list=new IntList(8);
948
949 TaxNode tn=tn0;
950 while(tn!=null){
951 if(!skipNonCanonical || tn.isSimple()){
952 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);}
953 }
954 if(tn.pid==tn.id){break;}
955 tn=getNode(tn.pid);
956 }
957 if(list.isEmpty()){list.add(tn0.id);}
958 return list;
959 }
960
961 /**
962 * Return a list of all ancestors.
963 * @param tn0 Base node
964 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
965 * @return List of ancestor nodes.
966 */
967 public ArrayList<TaxNode> toAncestors(final TaxNode tn0, boolean skipNonCanonical){
968 if(tn0==null){return null;}
969 ArrayList<TaxNode> list=new ArrayList<TaxNode>(8);
970
971 TaxNode tn=tn0;
972 while(tn!=null){
973 if(!skipNonCanonical || tn.isSimple()){
974 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);}
975 }
976 if(tn.pid==tn.id){break;}
977 tn=getNode(tn.pid);
978 }
979 if(list.isEmpty()){list.add(tn0);}
980 return list;
981 }
982
983 /**
984 * Generate a path to the genome of an organism on the filesystem;
985 * used by ExplodeTree. Intended for internal JGI use.
986 * @param root Location of the exploded tree.
987 * @return Path to a genome.
988 */
989 public String toDir(TaxNode node, String root){
990 StringBuilder sb=new StringBuilder();
991 if(root==null){root="";}
992 sb.append(root);
993 if(root.length()>0 && !root.endsWith("/")){sb.append('/');}
994 IntList list=toAncestorIds(node, false);
995 list.reverse();
996 assert(list.get(0)==1) : list + "," +getNode(list.get(0));
997 for(int i=0; i<list.size(); i++){
998 sb.append(list.get(i));
999 sb.append('/');
1000 }
1001 return sb.toString();
1002 }
1003
1004 /*--------------------------------------------------------------*/
1005 /*---------------- Outer Methods ----------------*/
1006 /*--------------------------------------------------------------*/
1007
1008 /**
1009 * Use various techniques to get a TaxID from an unknown String, such as parsing,
1010 * name lookups, and accession translation.
1011 * @param s String to process.
1012 * @return Decoded TaxID.
1013 */
1014 public static int getID(String s){return GiToTaxid.getID(s);}
1015
1016 /**
1017 * Use various techniques to get a TaxID from an unknown byte[], such as parsing,
1018 * name lookups, and accession translation.
1019 * @param s String to process.
1020 * @return Decoded TaxID.
1021 */
1022 public static int getID(byte[] s){return GiToTaxid.getID(s);}
1023
1024 /** Return the lowest ancestor of the named node with taxonomic level at least minLevel */
1025 public TaxNode getNode(String s, int minLevelExtended){
1026 TaxNode tn=parseNodeFromHeader(s, true);
1027 while(tn!=null && tn.levelExtended<minLevelExtended && tn.pid!=tn.id){
1028 tn=getNode(tn.pid);
1029 }
1030 return tn;
1031 }
1032
1033 /**
1034 * Determine whether a node is a descendant of another.
1035 * @param child Possible child TaxID.
1036 * @param parent Possible parent TaxID.
1037 * @return true iff child descends from parent.
1038 */
1039 public boolean descendsFrom(final int child, final int parent){
1040 TaxNode cn=getNode(child), pn=getNode(parent);
1041 assert(cn!=null) : "Invalid taxID: "+child;
1042 assert(pn!=null) : "Invalid taxID: "+parent;
1043 return descendsFrom(cn, pn);
1044 }
1045
1046 /**
1047 * Determine whether a node is a descendant of another.
1048 * @param child Possible child node.
1049 * @param parent Possible parent node.
1050 * @return true iff child descends from parent.
1051 */
1052 public boolean descendsFrom(TaxNode child, TaxNode parent){
1053 assert(child!=null && parent!=null) : "Null parameters.";
1054 if(child==null || parent==null){return false;}
1055
1056 while(child!=parent && child.levelExtended<=parent.levelExtended && child.id!=child.pid){
1057 child=getNode(child.pid);
1058 }
1059 return child==parent;
1060 }
1061
1062 /**
1063 * Determine whether an organism is classified as X.
1064 * @param taxID taxID of organism.
1065 * @param ancestorID taxID of possible ancestor.
1066 * @return true if the organism is an X.
1067 */
1068 public boolean descendsFrom2(int taxID, final int ancestorID) {
1069 TaxNode tn=getNode(taxID);
1070 while(tn.id!=tn.pid){
1071 if(tn.id==ancestorID){return true;}
1072 tn=getNode(tn.pid);
1073 }
1074 return false;
1075 }
1076
1077 /** Determine whether an organism is classified as a plant. */
1078 public boolean isPlant(int taxID) {return descendsFrom2(taxID, VIRIDIPLANTAE_ID);}
1079
1080 /** Determine whether an organism is classified as an animal. */
1081 public boolean isAnimal(int taxID) {return descendsFrom2(taxID, METAZOA_ID);}
1082
1083 /** Determine whether an organism is classified as a fungus. */
1084 public boolean isFungus(int taxID) {return descendsFrom2(taxID, FUNGI_ID);}
1085
1086 /** Determine whether an organism is classified as a eukaryote. */
1087 public boolean isEukaryote(int taxID) {return descendsFrom2(taxID, EUKARYOTA_ID);}
1088
1089 /** Determine whether an organism is classified as a prokaryote. */
1090 public boolean isProkaryote(int taxID) {
1091 TaxNode tn=getNode(taxID);
1092 if(tn==null){
1093 System.err.println("*** Warning: Can't find node "+taxID+" ***");
1094 return false;
1095 }
1096 while(tn.id!=tn.pid){
1097 if(tn.id==BACTERIA_ID || tn.id==ARCHAEA_ID){return true;}
1098 tn=getNode(tn.pid);
1099 }
1100 return false;
1101 }
1102
1103 /**
1104 * Calculate the common ancestor of two nodes.
1105 * @param a TaxID of a node.
1106 * @param b TaxID of a node.
1107 * @return Common ancestor ID of a and b.
1108 */
1109 public int commonAncestor(final int a, final int b){
1110 TaxNode an=getNode(a), bn=getNode(b);
1111 assert(an!=null) : "Invalid taxID: "+a;
1112 assert(bn!=null) : "Invalid taxID: "+b;
1113 TaxNode cn=commonAncestor(an, bn);
1114 assert(cn!=null) : "No common ancestor: "+an+", "+bn;
1115 if(cn==null){return -1;}
1116 return cn.id;
1117 }
1118
1119 /**
1120 * Calculate the common ancestor of two nodes.
1121 * @param a A node.
1122 * @param b A node.
1123 * @return Common ancestor of a and b.
1124 */
1125 public TaxNode commonAncestor(TaxNode a, TaxNode b){
1126 assert(a!=null && b!=null) : "Null parameters.";
1127 if(a==null){return b;}
1128 if(b==null){return a;}
1129
1130 while(a!=b){
1131 if(a.levelExtended<b.levelExtended){
1132 a=getNode(a.pid);
1133 }else{
1134 b=getNode(b.pid);
1135 }
1136 }
1137 return a;
1138 }
1139
1140 /**
1141 * Identify the highest ancestor of a node;
1142 * this will presumably be "Life".
1143 * @param a Node
1144 * @return Highest ancestor
1145 */
1146 public TaxNode highestAncestor(TaxNode a){
1147 assert(a!=null);
1148 while(a.id!=a.pid){a=getNode(a.pid);}
1149 return a;
1150 }
1151
1152 /*--------------------------------------------------------------*/
1153 /*---------------- Header Parsing ----------------*/
1154 /*--------------------------------------------------------------*/
1155
1156 /**
1157 * Determine the TaxID of a String,
1158 * without a loaded TaxTree.
1159 * This only works if the literal TaxID is embedded in the String.
1160 * @param header Typically a sequence header
1161 * @return Decoded TaxID, or -1 if unsuccessful
1162 */
1163 public static int parseHeaderStatic(String header){
1164 if(header.length()<3){return -1;}
1165 if(header.charAt(0)=='>'){header=header.substring(1);}
1166 if(!header.startsWith("tid|")){return -1;}
1167 int idx=3;
1168 int idx2=header.indexOf('|', 4);
1169 if(idx2<5){return -1;}
1170 int id=-1;
1171 try {
1172 id=Parse.parseInt(header, idx+1, idx2);
1173 // System.err.println("d"+", "+header.substring(idx+1, idx2));
1174 } catch (Throwable e) {
1175 // System.err.println("e"+", "+header.substring(idx+1, idx2));
1176 //ignore
1177 }
1178 return id;
1179 }
1180
1181 /**
1182 * Determine the TaxID of a String.
1183 * @param header Typically a sequence header
1184 * @param bestEffort In some cases, try certain substrings if the name is not found.
1185 * @return
1186 */
1187 public TaxNode parseNodeFromHeader(String header, boolean bestEffort){
1188 if(header==null || header.length()<2){return null;}
1189 if(header.charAt(0)=='>'){header=header.substring(1);}
1190 TaxNode tn;
1191 if(SILVA_MODE){
1192 tn=getNodeSilva(header, bestEffort);
1193 }else if(UNITE_MODE){
1194 tn=getNodeUnite(header, bestEffort);
1195 }else{
1196 final char delimiter=ncbiHeaderDelimiter(header);
1197 if(delimiter==' '){
1198 tn=getNodeNewStyle(header);
1199 }else{
1200 tn=getNodeOldStyle(header, delimiter);
1201 if(tn==null && delimiter=='|'){
1202 // System.err.println("A: "+header);
1203 int id=-1;
1204 String[] split=header.split("\\|");
1205 if(AccessionToTaxid.LOADED()){
1206 for(int i=0; i<split.length && id<0; i++){//Try accessions first
1207 if(AccessionToTaxid.isValidAccession(split[i])){
1208 id=AccessionToTaxid.get(split[i]);
1209 }
1210 }
1211 }
1212 for(int i=0; i<split.length && id<0; i++){//Then names
1213 id=parseNameToTaxid(split[i]);
1214 }
1215 // System.err.println("E: "+id);
1216 if(id>=0){tn=getNode(id);}
1217 // System.err.println("F: "+tn);
1218 }
1219 }
1220 }
1221 return tn;
1222 }
1223
1224 /**
1225 * Guess the delimiter character in a String;
1226 * typically assumed to be '|', '~', or ' '.
1227 */
1228 public static char ncbiHeaderDelimiter(String header){
1229 for(int i=0; i<header.length(); i++){
1230 final char c=header.charAt(i);
1231 if(c=='|' || c=='~'){
1232 assert(i>0) : "i="+i+"; malformatted header '"+header+"'";
1233 return c;
1234 }else if(Character.isWhitespace(c)){
1235 return ' ';
1236 }
1237 }
1238 return ' ';
1239 }
1240
1241 /**
1242 * Parse a Silva header to a Node.
1243 * @param s Silva header.
1244 * @param bestEffort Try certain substrings if the name is not found.
1245 * @return Node
1246 */
1247 TaxNode getNodeSilva(String s, boolean bestEffort){
1248 if(s==null){return null;}
1249 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){
1250 return getNodeOldStyle(s, s.charAt(3));
1251 }
1252 String[] split=Tools.semiPattern.split(s);
1253
1254 int number=-1;
1255 // final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast"));
1256 // if(chloroplast){return null;}
1257 for(int i=split.length-1; number<0 && i>=0; i--){
1258 String last=split[i];
1259 int paren=last.indexOf('(');
1260 if(paren>=0){last=last.substring(0, paren);}
1261 last=last.trim();
1262
1263 if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){
1264 number=parseNameToTaxid(last);
1265 }
1266
1267 if(number>=0){return getNode(number);}
1268 else if(!bestEffort){break;}
1269 }
1270 return null;
1271 }
1272
1273 /**
1274 * Parse a Unite header to a Node.
1275 * @param s Unite header.
1276 * @param bestEffort Try certain substrings if the name is not found.
1277 * @return Node
1278 */
1279 TaxNode getNodeUnite(String s, boolean bestEffort){
1280 if(s==null){return null;}
1281 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){
1282 return getNodeOldStyle(s, s.charAt(3));
1283 }
1284 String[] split=Tools.pipePattern.split(s);
1285
1286 int number=-1;
1287 String name=split[0];
1288 String acc=split[1];
1289 if(AccessionToTaxid.LOADED() && acc.length()>0){
1290 number=AccessionToTaxid.get(acc);
1291 }
1292 if(number<1){
1293 TaxNode tn=getNodeByName(name);
1294 if(tn!=null){number=tn.id;}
1295 }
1296
1297 if(number>=0){return getNode(number);}
1298 return null;
1299 }
1300
1301 /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */
1302 private TaxNode getNodeOldStyle(final String s, char delimiter){
1303 {
1304 int index=s.indexOf(delimiter);
1305 if(index<0){
1306 delimiter='~';
1307 index=s.indexOf(delimiter);
1308 if(index<0){
1309 delimiter='_';
1310 index=s.indexOf(delimiter);
1311 }
1312 }
1313 int number=-1;
1314
1315 Throwable e=null;
1316
1317 if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){
1318 // System.err.println("Parsing gi number.");
1319
1320 if(GiToTaxid.isInitialized()){
1321 try {
1322 number=GiToTaxid.parseGiToTaxid(s, delimiter);
1323 } catch (Throwable e2) {
1324 e=e2;
1325 }
1326 }else{
1327 assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s;
1328 }
1329 // if(number!=-1){System.err.println("number="+number);}
1330 }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){
1331 // System.err.println("Parsing ncbi number.");
1332 number=GiToTaxid.parseTaxidNumber(s, delimiter);
1333 }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){
1334 // System.err.println("Parsing ncbi number.");
1335 long img=parseDelimitedNumber(s, delimiter);
1336 ImgRecord record=imgMap.get(img);
1337 number=(record==null ? -1 : record.taxID);
1338 }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete
1339 // System.err.println("Parsing ncbi number.");
1340 number=GiToTaxid.parseTaxidNumber(s, delimiter);
1341 }
1342
1343 if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){
1344 String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s));
1345 if(AccessionToTaxid.LOADED()){
1346 number=parseAccessionToTaxid(split);
1347 }
1348 if(number<0){
1349 number=parseHeaderNameToTaxid(split);
1350 }
1351 }
1352
1353 if(number<0 && e!=null){
1354 assert(false) : e;
1355 throw new RuntimeException(e);
1356 }
1357
1358 //TaxServer code could go here...
1359
1360 if(number>=0){return getNode(number);}
1361 }
1362 if(verbose){System.err.println("Can't process name "+s);}
1363 if(Tools.isDigit(s.charAt(0)) && s.length()<=9){
1364 try {
1365 return getNode(Integer.parseInt(s));
1366 } catch (NumberFormatException e) {
1367 //ignore
1368 }
1369 }
1370 return null;
1371 }
1372
1373 /** Parse a delimited number from a header, or return -1 if formatted incorrectly. */
1374 static long parseDelimitedNumber(String s, char delimiter){
1375 if(s==null){return -1;}
1376 int i=0;
1377 while(i<s.length() && s.charAt(i)!=delimiter){i++;}
1378 i++;
1379 if(i>=s.length() || !Tools.isDigit(s.charAt(i))){return -1;}
1380
1381 long number=0;
1382 while(i<s.length()){
1383 char c=s.charAt(i);
1384 if(c==delimiter || c==' ' || c=='\t'){break;}
1385 assert(Tools.isDigit(c)) : c+"\n"+s;
1386 number=(number*10)+(c-'0');
1387 i++;
1388 }
1389 return number;
1390 }
1391
1392 /** Parses sequence headers using NCBI's current header system, with Accessions. */
1393 private TaxNode getNodeNewStyle(final String s){
1394
1395 int space=s.indexOf(' ');
1396 int number=-1;
1397
1398 if(AccessionToTaxid.LOADED()){
1399 if(space>0){
1400 number=AccessionToTaxid.get(s.substring(0, space));
1401 }else{
1402 number=AccessionToTaxid.get(s);
1403 }
1404 }
1405
1406 if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){
1407 try {
1408 return getNode(Integer.parseInt(s));
1409 } catch (NumberFormatException e) {
1410 //ignore
1411 }
1412 }
1413
1414 if(number<0 && space>0){
1415 number=parseNameToTaxid(s.substring(space+1));
1416 }
1417
1418 if(number>-1){return getNode(number);}
1419 if(space<0 && s.indexOf('_')>0){
1420 return getNodeNewStyle(s.replace('_', ' '));
1421 }
1422 return null;
1423 }
1424
1425 /**
1426 * For parsing old-style NCBI headers.
1427 */
1428 public int parseAccessionToTaxid(String[] split){
1429 if(split.length<4){
1430 return -1;
1431 }
1432 int ncbi=AccessionToTaxid.get(split[3]);
1433 return ncbi;
1434 }
1435
1436 /**
1437 * For parsing old-style NCBI headers.
1438 */
1439 public int parseHeaderNameToTaxid(String[] split){
1440 if(split.length<5){
1441 return -1;
1442 }
1443 return parseNameToTaxid(split[4]);
1444 }
1445
1446 /**
1447 * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens").
1448 * If multiple nodes share the same name, returns the first; to get the full list,
1449 * use getNodesByNameExtended.
1450 * @param name Organism name.
1451 * @return Organism TaxID, or -1 if not found.
1452 */
1453 public int parseNameToTaxid(String name){
1454 // assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size());
1455 List<TaxNode> list=null;
1456
1457 list=getNodesByNameExtended(name);
1458
1459 if(list==null || list.size()>1){return -1;}
1460 return list.get(0).id;
1461 }
1462
1463 /**
1464 * Fetch nodes indicated by this name.
1465 * @param name A taxonomic name delimited by space or underscore.
1466 * @return Nodes corresponding to the name.
1467 */
1468 public List<TaxNode> getNodesByNameExtended(String name){
1469 List<TaxNode> list=null;
1470
1471 list=getNodesByName(name);
1472 if(list!=null){return list;}
1473
1474 name=name.replaceAll("_", " ").trim();
1475 list=getNodesByName(name);
1476 if(list!=null){return list;}
1477
1478 String[] split2=name.split(" ");
1479
1480 if(split2.length>7){
1481 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7];
1482 list=getNodesByName(term);
1483 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1484 if(list!=null){return list;}
1485 }
1486
1487 if(split2.length>6){
1488 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6];
1489 list=getNodesByName(term);
1490 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1491 if(list!=null){return list;}
1492 }
1493
1494 if(split2.length>5){
1495 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5];
1496 list=getNodesByName(term);
1497 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1498 if(list!=null){return list;}
1499 }
1500 if(split2.length>4){
1501 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4];
1502 list=getNodesByName(term);
1503 // System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1504 if(list!=null){return list;}
1505 }
1506 if(split2.length>3){
1507 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3];
1508 list=getNodesByName(term);
1509 // System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1510 if(list!=null){return list;}
1511 }
1512 if(split2.length>2){
1513 String term=split2[0]+" "+split2[1]+" "+split2[2];
1514 list=getNodesByName(term);
1515 // System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1516 if(list!=null){return list;}
1517 }
1518 if(split2.length>1){
1519 String term=split2[0]+" "+split2[1];
1520 list=getNodesByName(term);
1521 // System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1522 if(list!=null){return list;}
1523 }
1524 if(split2.length>0){
1525 String term=split2[0];
1526 list=getNodesByName(term);
1527 // System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
1528 if(list!=null){return list;}
1529 }
1530
1531 return null;
1532 }
1533
1534 /*--------------------------------------------------------------*/
1535 /*---------------- Assorted Methods ----------------*/
1536 /*--------------------------------------------------------------*/
1537
1538 /**
1539 * Return the TaxID of the lowest ancestor node at least the specified level,
1540 * including this node itself. Level is the normal (non-extended) level.
1541 * @param taxID
1542 * @param taxLevel
1543 * @return
1544 */
1545 public int promote(final int taxID, int taxLevel){
1546 TaxNode tn=null;
1547 tn=(taxID<1 ? null : getNode(taxID));
1548 tn=promote(tn, taxLevel);
1549 return (tn==null ? taxID : tn.id);
1550 }
1551
1552 /**
1553 * Fetch the first node in this node's lineage of at least the indicated level.
1554 * This can be the node itself or an ancestor.
1555 * @see getNodeAtLevelExtended
1556 * @param tn Node in question
1557 * @param taxLevel Desired minimum level
1558 * @return A node at the desired level
1559 */
1560 public TaxNode promote(TaxNode tn, int taxLevel){
1561 while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){
1562 TaxNode temp=getNode(tn.pid);
1563 if(temp==null || temp==tn || temp.level>=TaxTree.LIFE || temp.level>taxLevel){break;}
1564 tn=temp;
1565 }
1566 return tn;
1567 }
1568
1569 /**
1570 * Determine the TaxID of the node's parent.
1571 * @param id TaxID of child node
1572 * @return Parent TaxID
1573 */
1574 public int getParentID(int id){
1575 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
1576 + "\nPlease redownload it and regenerate the taxtree.";
1577 if(id<0 || id>=nodes.length){return -1;}
1578 TaxNode tn=nodes[id];
1579 if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);}
1580 return tn==null ? -1 : tn.pid;
1581 }
1582
1583 /**
1584 * Fetch the node with this TaxID.
1585 * @param id TaxID
1586 * @return Node
1587 */
1588 public TaxNode getNode(int id){
1589 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
1590 + "\nPlease redownload it and regenerate the taxtree.";
1591 if(id<0 || id>=nodes.length){return null;}
1592 TaxNode tn=nodes[id];
1593 if(tn!=null || mergedMap==null){return tn;}
1594 return getNode(mergedMap.get(id), true);
1595 }
1596
1597 /**
1598 * Fetch the node with this TaxID, but don't throw assertions upon failure.
1599 * @param id TaxID
1600 * @return Node
1601 */
1602 public TaxNode getNode(int id, boolean skipAssertion){
1603 assert(skipAssertion || id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
1604 + "\nPlease redownload it and regenerate the taxtree.";
1605 if(id<0 || id>=nodes.length){return null;}
1606 TaxNode tn=nodes[id];
1607 if(tn!=null || mergedMap==null){return tn;}
1608 return getNode(mergedMap.get(id), true);
1609 }
1610
1611 public TaxNode getNodeAtLevel(int id, int minLevel){
1612 return getNodeAtLevel(id, minLevel, DOMAIN);
1613 }
1614
1615 public TaxNode getNodeAtLevelExtended(int id, int minLevelE){
1616 return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E);
1617 }
1618
1619 public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){
1620 final int minLevelExtended=levelToExtended(minLevel);
1621 final int maxLevelExtended=levelToExtended(maxLevel);
1622 return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended);
1623 }
1624
1625 public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){
1626 TaxNode tn=getNode(id);
1627 while(tn!=null && tn.pid!=tn.id && tn.levelExtended<minLevelE){
1628 TaxNode temp=getNode(tn.pid);
1629 if(temp==null || temp.levelExtended>maxLevelE){break;}
1630 tn=temp;
1631 }
1632 return tn;
1633 }
1634
1635 public int getIdAtLevelExtended(int taxID, int taxLevelExtended){
1636 if(taxLevelExtended<0){return taxID;}
1637 TaxNode tn=getNode(taxID);
1638 while(tn!=null && tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){
1639 tn=getNode(tn.pid);
1640 if(tn.levelExtended>taxLevelExtended){break;}
1641 taxID=tn.id;
1642 }
1643 return taxID;
1644 }
1645
1646 /**
1647 * Fetch the node with this name.
1648 * Throw an assertion if there are multiple such nodes.
1649 * @param s Organism name.
1650 * @return Node with given name.
1651 */
1652 public TaxNode getNodeByName(String s){
1653 List<TaxNode> list=getNodesByName(s, false);
1654 if(list==null){list=getNodesByName(s, true);}
1655 if(list==null || list.isEmpty()){return null;}
1656 if(list.size()==1){return list.get(0);}
1657 assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n";
1658 TaxNode a=list.get(0);
1659 for(int i=1; i<list.size(); i++){
1660 TaxNode b=list.get(i);
1661 //Keep the most specific node
1662 // if(a==null || (b!=null && b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf())){//not necessary
1663 if(b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf()){
1664 a=b;
1665 }
1666 }
1667 return a;
1668 }
1669
1670 /**
1671 * Fetch a list of all nodes with this name.
1672 * @param s Organism name.
1673 * @return Nodes with given name.
1674 */
1675 public List<TaxNode> getNodesByName(String s){
1676 List<TaxNode> list=getNodesByName(s, false);
1677 if(list==null){list=getNodesByName(s, true);}
1678 return list;
1679 }
1680
1681 /**
1682 * Fetch a map of names to nodes. If absent, create it first.
1683 * @param lowercase If true, return the map with lowercase keys.
1684 * @return Map of names to nodes.
1685 */
1686 private HashMap<String, ArrayList<TaxNode>> getMap(boolean lowercase){
1687 HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap);
1688 if(map==null){
1689 synchronized(this){hashNames(true);}
1690 map=(lowercase ? nameMapLower : nameMap);
1691 }
1692 assert(map!=null) : "Tax names were not hashed.";
1693 return map;
1694 }
1695
1696 private List<TaxNode> getNodesByName(String s, boolean lowercase){
1697 if(s==null){return null;}
1698 if(s.indexOf('_')>=0){s=s.replace('_', ' ');}
1699 if(lowercase){s=s.toLowerCase();}
1700 // System.err.println("Searching for "+s);
1701 final HashMap<String, ArrayList<TaxNode>> map=getMap(lowercase);
1702 ArrayList<TaxNode> list=map.get(s);
1703 if(list!=null){return list;}
1704 // System.err.println("No matches for '"+s+"'");
1705
1706 // assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s);
1707
1708 if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;}
1709 String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8);
1710 // System.err.println("Array: "+Arrays.toString(split));
1711 list=map.get(split[split.length-1]);
1712 if(list==null){return list;}
1713 // System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")");
1714
1715 int matchCount=0;
1716 for(TaxNode tn : list){
1717 if(tn.matchesName(split, split.length-1, this)){matchCount++;}
1718 }
1719 if(matchCount==list.size()){return list;}
1720 if(matchCount<1){return null;}
1721 ArrayList<TaxNode> hits=new ArrayList<TaxNode>(matchCount);
1722 for(TaxNode tn : list){
1723 if(tn.matchesName(split, split.length-1, this)){hits.add(tn);}
1724 }
1725 return hits;
1726 }
1727 public ArrayList<TaxNode> getAncestors(int id){
1728 TaxNode current=getNode(id);
1729 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
1730 while(current!=null && current.pid!=current.id){//ignores root
1731 list.add(current);
1732 current=getNode(current.pid);
1733 }
1734 //optionally add root here
1735 return list;
1736 }
1737
1738 public void increment(IntList ids, IntList counts, boolean sync){
1739
1740 ids.sort();
1741 ids.getUniqueCounts(counts);
1742
1743 if(!sync){
1744 for(int i=0; i<ids.size; i++){
1745 int id=ids.get(i);
1746 int count=counts.get(i);
1747 incrementRaw(id, count);
1748 }
1749 }else{
1750 synchronized(this){
1751 for(int i=0; i<ids.size; i++){
1752 int id=ids.get(i);
1753 int count=counts.get(i);
1754 incrementRaw(id, count);
1755 }
1756 }
1757 }
1758 }
1759
1760 public void incrementRaw(int id, long amt){
1761 assert(id>=0 && id<nodes.length) : "TaxID "+id+" is out of range."+(id<0 ? "" : " Possibly the taxonomy data needs to be updated.");
1762 assert(nodes[id]!=null) : "No node for TaxID "+id+"; possibly the taxonomy data needs to be updated.";
1763 nodes[id].incrementRaw(amt);
1764 }
1765
1766 public void percolateUp(){
1767 for(int i=0; i<treeLevelsExtended.length; i++){
1768 percolateUp(i);
1769 }
1770 }
1771
1772 public void percolateUp(final int fromLevel){
1773 final TaxNode[] stratum=treeLevelsExtended[fromLevel];
1774 for(final TaxNode n : stratum){
1775 n.incrementSum(n.countRaw);
1776 TaxNode parent=nodes[n.pid];
1777 if(n!=parent){
1778 parent.incrementSum(n.countSum);
1779 }
1780 }
1781 }
1782
1783 /** Add this amount to the node and all its ancestors. */
1784 public void percolateUp(TaxNode node, long amt){
1785 if(amt==0){return;}
1786 if(verbose){System.err.println("percolateUp("+amt+") node: "+node);}
1787 while(node.id!=node.pid){
1788 node.incrementSum(amt);
1789 node=nodes[node.pid];
1790 }
1791 node.incrementSum(amt);
1792 }
1793
1794 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){
1795 return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1);
1796 }
1797
1798 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){
1799 final int minLevelExtended=levelToExtended(minLevel);
1800 final int maxLevelExtended=levelToExtended(maxLevel);
1801 // assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended;
1802 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
1803 for(int i=minLevelExtended; i<nodesPerLevelExtended.length && i<=maxLevelExtended; i++){
1804 list.addAll(gatherNodesAtLeastLimitExtended(i, limit));
1805 }
1806 Shared.sort(list, TaxNode.countComparator);
1807 return list;
1808 }
1809
1810 public ArrayList<TaxNode> gatherNodesAtLeastLimitExtended(final int fromLevelExtended, final long limit){
1811 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
1812 final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended];
1813 for(final TaxNode n : stratum){
1814 if(n.countSum>=limit){
1815 list.add(n);
1816 TaxNode parent=nodes[n.pid];
1817 if(n!=parent){
1818 percolateUp(parent, -n.countSum);//123 This was negative for some reason
1819 }
1820 }
1821 }
1822 Shared.sort(list, TaxNode.countComparator);
1823 return list;
1824 }
1825
1826 /*--------------------------------------------------------------*/
1827 /*---------------- Static Initializers ----------------*/
1828 /*--------------------------------------------------------------*/
1829
1830 /**
1831 * Generate the name to level number map.
1832 */
1833 private static HashMap<String, Integer> makeLevelMap() {
1834 HashMap<String, Integer> map=new HashMap<String, Integer>(31);
1835 for(int i=0; i<taxLevelNames.length; i++){
1836 map.put(taxLevelNames[i], i);
1837 map.put(taxLevelNames[i].toUpperCase(), i);
1838 }
1839 map.put("clade", NO_RANK);
1840 map.put("clade".toUpperCase(), NO_RANK);
1841 return map;
1842 }
1843
1844 /**
1845 * Generate the name to extended level number map.
1846 */
1847 private static HashMap<String, Integer> makeLevelMapExtended() {
1848 HashMap<String, Integer> map=new HashMap<String, Integer>(129);
1849 for(int i=0; i<taxLevelNamesExtended.length; i++){
1850 map.put(taxLevelNamesExtended[i], i);
1851 map.put(taxLevelNamesExtended[i].toUpperCase(), i);
1852 }
1853 map.put("clade", NO_RANK_E);
1854 map.put("clade".toUpperCase(), NO_RANK_E);
1855 return map;
1856 }
1857
1858 /**
1859 * I think this maps normal and extend names to normal level numbers.
1860 */
1861 private static HashMap<String, Integer> makeAltLevelMap() {
1862 HashMap<String, Integer> map=new HashMap<String, Integer>(129);
1863 for(int i=0; i<taxLevelNames.length; i++){
1864 map.put(taxLevelNames[i], i);
1865 map.put(taxLevelNames[i].toUpperCase(), i);
1866 }
1867 map.put("clade", NO_RANK);
1868 map.put("clade".toUpperCase(), NO_RANK);
1869
1870 //Add synonyms
1871 // map.put("subfamily", map.get("family"));
1872 // map.put("tribe", map.get("family"));
1873 // map.put("varietas", map.get("subspecies"));
1874 // map.put("subgenus", map.get("genus"));
1875 // map.put("forma", map.get("subspecies"));
1876 // map.put("species group", map.get("genus"));
1877 // map.put("species subgroup", map.get("genus"));
1878 // map.put("cohort", map.get("class"));
1879 // map.put("subclass", map.get("class"));
1880 // map.put("infraorder", map.get("order"));
1881 // map.put("superorder", map.get("class"));
1882 // map.put("subphylum", map.get("phylum"));
1883 // map.put("infraclass", map.get("class"));
1884 // map.put("superkingdom", map.get("division"));
1885 // map.put("parvorder", map.get("order"));
1886 // map.put("superclass", map.get("phylum"));
1887 // map.put("superphylum", map.get("kingdom"));
1888 // map.put("subkingdom", map.get("kingdom"));
1889 // map.put("superfamily", map.get("order"));
1890 // map.put("superkingdom", map.get("domain"));
1891 // map.put("suborder", map.get("order"));
1892 // map.put("subtribe", map.get("family"));
1893
1894 for(String[] array : taxLevelNamesExtendedMatrix){
1895 String head=array[array.length-1];
1896 Integer value=map.get(head);
1897 assert(value!=null) : head;
1898 for(String key : array){
1899 if(key!=head){
1900 assert(!map.containsKey(key)) : "Map already contains key "+key+": "+Arrays.toString(array);
1901 map.put(key, value);
1902 map.put(key.toUpperCase(), value);
1903 }
1904 }
1905 }
1906
1907 return map;
1908 }
1909
1910 /*--------------------------------------------------------------*/
1911 /*---------------- Size ----------------*/
1912 /*--------------------------------------------------------------*/
1913
1914 /** Number of bp associated with this node in RefSeq */
1915 public long toSize(TaxNode tn){
1916 if(tn==null){return 0;}
1917 if(refseqSizeMap==null){return -1L;}
1918 final long x=refseqSizeMap.get(tn.id);
1919 return Tools.max(0, x);
1920 }
1921
1922 /** Number of bp associated with this node and descendants in RefSeq */
1923 public long toSizeC(TaxNode tn){
1924 if(tn==null){return 0;}
1925 if(refseqSizeMapC==null){return -1L;}
1926 final long x=refseqSizeMapC.get(tn.id);
1927 return Tools.max(0, x);
1928 }
1929
1930 /** Number of sequences associated with this node in RefSeq */
1931 public int toSeqs(TaxNode tn){
1932 if(tn==null){return 0;}
1933 if(refseqSeqMap==null){return -1;}
1934 final int x=refseqSeqMap.get(tn.id);
1935 return Tools.max(0, x);
1936 }
1937
1938 /** Number of sequences associated with this node and descandants in RefSeq */
1939 public long toSeqsC(TaxNode tn){
1940 if(tn==null){return 0;}
1941 if(refseqSeqMapC==null){return -1L;}
1942 final long x=refseqSeqMapC.get(tn.id);
1943 return Tools.max(0, x);
1944 }
1945
1946 /** Number of descendants of this node */
1947 public int toNodes(TaxNode tn){
1948 if(tn==null){return 0;}
1949 if(nodeMapC==null){return -1;}
1950 final int x=nodeMapC.get(tn.id);
1951 return Tools.max(0, x);
1952 }
1953
1954 /**
1955 * Fills refseqSizeMap, refseqSizeMapC, etc. from a file containing the summary.
1956 * @param fname Size file name
1957 */
1958 public void loadSizeFile(String fname){
1959 if(fname==null){return;}
1960 assert(refseqSizeMap==null);
1961 refseqSizeMap=new IntLongHashMap();
1962 refseqSizeMapC=new IntLongHashMap();
1963 refseqSeqMap=new IntHashMap();
1964 refseqSeqMapC=new IntLongHashMap();
1965 nodeMapC=new IntHashMap();
1966
1967 final ByteFile bf=ByteFile.makeByteFile(fname, true);
1968 final byte delimiter='\t';
1969 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
1970 if(line.length>0 && line[0]!='#'){
1971 int a=0, b=0;
1972
1973 while(b<line.length && line[b]!=delimiter){b++;}
1974 assert(b>a) : "Missing field 0: "+new String(line);
1975 int tid=Parse.parseInt(line, a, b);
1976 b++;
1977 a=b;
1978
1979 while(b<line.length && line[b]!=delimiter){b++;}
1980 assert(b>a) : "Missing field 1: "+new String(line);
1981 long size=Parse.parseLong(line, a, b);
1982 b++;
1983 a=b;
1984
1985 while(b<line.length && line[b]!=delimiter){b++;}
1986 assert(b>a) : "Missing field 2: "+new String(line);
1987 long csize=Parse.parseLong(line, a, b);
1988 b++;
1989 a=b;
1990
1991 while(b<line.length && line[b]!=delimiter){b++;}
1992 assert(b>a) : "Missing field 3: "+new String(line);
1993 int seqs=Parse.parseInt(line, a, b);
1994 b++;
1995 a=b;
1996
1997 while(b<line.length && line[b]!=delimiter){b++;}
1998 assert(b>a) : "Missing field 4: "+new String(line);
1999 long cseqs=Parse.parseLong(line, a, b);
2000 b++;
2001 a=b;
2002
2003 while(b<line.length && line[b]!=delimiter){b++;}
2004 assert(b>a) : "Missing field 5: "+new String(line);
2005 int cnodes=Parse.parseInt(line, a, b);
2006 b++;
2007 a=b;
2008
2009 if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);}
2010 if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);}
2011 if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);}
2012 if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);}
2013 if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);}
2014 }
2015 }
2016 bf.close();
2017 }
2018
2019 /*--------------------------------------------------------------*/
2020 /*---------------- IMG ----------------*/
2021 /*--------------------------------------------------------------*/
2022
2023 public static int imgToTaxid(long img){
2024 ImgRecord ir=imgMap.get(img);
2025 // assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n";
2026 return ir==null ? -1 : ir.taxID;
2027 }
2028
2029 public TaxNode imgToTaxNode(long img){
2030 int tid=imgToTaxid(img);
2031 return tid<1 ? null : getNode(tid);
2032 }
2033
2034 // public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){
2035 // assert(imgMap==null);
2036 // if(fname==null){return 0;}
2037 // ImgRecord2.storeName=storeName;
2038 // if(outstream!=null){System.err.println("Loading IMG.");}
2039 // Timer t=new Timer(outstream, false);
2040 // ImgRecord2[] array=ImgRecord2.toArray(fname);
2041 // int x=loadIMG(array);
2042 // t.stopAndPrint();
2043 // return x;
2044 // }
2045
2046 public static int loadIMG(String fname, boolean storeName, PrintStream outstream){
2047 assert(imgMap==null);
2048 if(fname==null){return 0;}
2049 ImgRecord.storeName=storeName;
2050 if(outstream!=null){System.err.println("Loading IMG.");}
2051 Timer t=new Timer(outstream, false);
2052 ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ);
2053 int x=loadIMG(array);
2054 t.stopAndPrint();
2055 return x;
2056 }
2057
2058 public static int loadIMG(ImgRecord[] array){
2059 assert(imgMap==null);
2060 imgMap=new HashMap<Long, ImgRecord>((int)(array.length*1.5));
2061 for(ImgRecord record : array){
2062 imgMap.put(record.imgID, record);
2063 }
2064 return imgMap.size();
2065 }
2066
2067 @Deprecated
2068 public static int parseLevel(String b){
2069 final int level;
2070 if(b==null){level=-1;}
2071 else if(Tools.isNumeric(b.charAt(0))){
2072 level=Integer.parseInt(b);
2073 }else{
2074 level=stringToLevel(b.toLowerCase());
2075 }
2076 return level;
2077 }
2078
2079 public static int parseLevelExtended(String b){
2080 final int level;
2081 if(b==null){level=-1;}
2082 else if(Tools.isNumeric(b.charAt(0))){
2083 level=levelToExtended(Integer.parseInt(b));
2084 }else{
2085 level=stringToLevelExtended(b.toLowerCase());
2086 }
2087 return level;
2088 }
2089
2090 public boolean isUnclassified(int tid){
2091 TaxNode tn=getNode(tid);
2092 while(tn!=null && tn.id!=tn.pid){
2093 if(tn.isUnclassified()){return true;}
2094 if(tn.pid==tn.id){break;}
2095 tn=getNode(tn.pid);
2096 }
2097 return false;
2098 }
2099
2100 public boolean isEnvironmentalSample(int tid){
2101 TaxNode tn=getNode(tid);
2102 while(tn!=null && tn.id!=tn.pid){
2103 if(tn.isEnvironmentalSample()){return true;}
2104 if(tn.pid==tn.id){break;}
2105 tn=getNode(tn.pid);
2106 }
2107 return false;
2108 }
2109
2110 public boolean isVirus(int tid){
2111 TaxNode tn=getNode(tid);
2112 while(tn!=null && tn.id!=tn.pid){
2113 if(tn.id==VIRUSES_ID){return true;}
2114 if(tn.pid==tn.id){break;}
2115 tn=getNode(tn.pid);
2116 }
2117 return false;
2118 }
2119
2120 public long definedLevels(int tid){
2121 long levels=0;
2122 TaxNode tn=getNode(tid);
2123 while(tn!=null && tn.id!=tn.pid){
2124 levels=levels|(1L<<tn.level);
2125 }
2126 return levels;
2127 }
2128
2129 public long definedLevelsExtended(int tid){
2130 long levels=0;
2131 TaxNode tn=getNode(tid);
2132 while(tn!=null && tn.id!=tn.pid){
2133 levels=levels|(1L<<tn.levelExtended);
2134 }
2135 return levels;
2136 }
2137
2138 /**
2139 * Generates the mononomial name for this taxonomic level based on the scientific name.
2140 * For example, "Homo sapiens" -> "Sapiens"
2141 * @param tid TaxID
2142 * @return Correct name for this node.
2143 */
2144 public String mononomial(int tid){return mononomial(getNode(tid));}
2145 public String mononomial(TaxNode tn){
2146 if(tn==null){return null;}
2147 String name=tn.name;
2148 if(name.indexOf(' ')<0){return name;}
2149 TaxNode parent=getNode(tn.pid);
2150 if(parent==null){return name;}
2151 String pname=parent.name;
2152 if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){
2153 name=name.substring(pname.length()+1);
2154 }
2155 return name;
2156 }
2157
2158 /*--------------------------------------------------------------*/
2159 /*---------------- Fields ----------------*/
2160 /*--------------------------------------------------------------*/
2161
2162 /** All nodes in the tree in a flat array, indexed by TaxiD */
2163 public final TaxNode[] nodes;
2164
2165 /** Number of nodes per normal level */
2166 public final int[] nodesPerLevel=new int[taxLevelNames.length];
2167
2168 /** Number of nodes per extended level */
2169 public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length];
2170
2171 /** Number of nodes in the tree */
2172 public final int nodeCount;
2173
2174 /** Maps old TaxIDs to new TaxIDs */
2175 public final IntHashMap mergedMap;
2176
2177 /** Arrays of all nodes at a given taxonomic level (extended) */
2178 public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][];
2179
2180 /** Map of names to nodes */
2181 HashMap<String, ArrayList<TaxNode>> nameMap;
2182 /** Map of lowercase names to nodes */
2183 HashMap<String, ArrayList<TaxNode>> nameMapLower;
2184 /** Map of nodes to child nodes */
2185 HashMap<TaxNode, ArrayList<TaxNode>> childMap;
2186 public HashMap<String, ArrayList<TaxNode>> nameMap(){return nameMap;}
2187
2188 @Deprecated
2189 public int minValidTaxa=0; //TODO: Remove (will break serialization)
2190
2191 /** Infer ranks for no-rank nodes, when possible */
2192 public boolean simplify=true;
2193 /** See simplify() for details, works in conjunction with simplify */
2194 public boolean reassign=true;
2195 /** Discard no-rank nodes */
2196 public boolean skipNorank=false;
2197 public int inferRankLimit=0;//levelMap.get("species");
2198
2199 //Node Statistics
2200 /** Number of bases assigned to this TaxID in RefSeq */
2201 private IntLongHashMap refseqSizeMap;
2202 /** Number of bases assigned to this TaxID and descendants in RefSeq */
2203 private IntLongHashMap refseqSizeMapC;
2204 /** Number of sequences assigned to this TaxID in RefSeq */
2205 private IntHashMap refseqSeqMap;
2206 /** Number of sequences assigned to this TaxID and descendants in RefSeq */
2207 private IntLongHashMap refseqSeqMapC;
2208 /** Number of descendant nodes, inclusive, for each TaxID */
2209 private IntHashMap nodeMapC;
2210
2211 /*--------------------------------------------------------------*/
2212 /*---------------- Statics ----------------*/
2213 /*--------------------------------------------------------------*/
2214
2215 /** Assign levels to unranked nodes below species level, when possible */
2216 public static boolean assignStrains=true;
2217 /** Assume headers are in Silva format */
2218 public static boolean SILVA_MODE=false;
2219 /** Assume headers are in Unite format */
2220 public static boolean UNITE_MODE=false;
2221 /** Probably unnecessary at this point... present for legacy reasons */
2222 public static boolean CRASH_IF_NO_GI_TABLE=true;
2223
2224 public static boolean verbose=false;
2225 public static boolean SHOW_WARNINGS=false;
2226
2227 /** Maps IMG IDs to records from the dump file */
2228 private static HashMap<Long, ImgRecord> imgMap;
2229
2230 /** Set to false if the tree is expected to be mutated.
2231 * @TODO Remove mutable fields from the tree (like counters).
2232 */
2233 public static boolean ALLOW_SHARED_TREE=true;
2234
2235 /** Universal location of the shared TaxTree used by various classes */
2236 private static TaxTree sharedTree;
2237
2238 /** A simpler and probably less safe version of sharedTree(...) */
2239 public static TaxTree getTree(){return sharedTree;}
2240
2241 /**
2242 * Fetch the shared tree, loading it from file if not present.
2243 * @return A tree.
2244 * @TODO: Check proper-construction of double-checked synchronize
2245 */
2246 private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) {
2247 if(!ALLOW_SHARED_TREE){return null;}
2248 if(sharedTree==null && fname!=null){
2249 if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();}
2250 synchronized(TaxTree.class){
2251 if(sharedTree==null){
2252 if(outstream!=null){outstream.println("Loading tax tree.");}
2253 Timer t=new Timer(outstream, false);
2254 setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat);
2255 t.stopAndPrint();
2256 }
2257 }
2258 }
2259 if(hashNames && sharedTree.nameMap==null){
2260 synchronized(sharedTree){
2261 if(sharedTree.nameMap==null){
2262 if(outstream!=null){outstream.println("Hashing names.");}
2263 Timer t=new Timer(outstream, false);
2264 sharedTree.hashNames(hashDotFormat);
2265 t.stopAndPrint();
2266 }
2267 }
2268 }
2269 return sharedTree;
2270 }
2271
2272 /**
2273 * For initialization. Normally only one tree is needed by a process so it is set here.
2274 * If the tree is already set nothing will happen, unless additional hashing is needed.
2275 */
2276 private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){
2277 assert(ALLOW_SHARED_TREE);
2278 assert(sharedTree==null);
2279 sharedTree=tree;
2280 if(hashNames && sharedTree.nameMap==null){
2281 synchronized(sharedTree){
2282 if(sharedTree.nameMap==null){
2283 sharedTree.hashNames(hashDotFormat);
2284 }
2285 }
2286 }
2287 }
2288
2289 /**
2290 * Determine whether a taxonomic level is standard. e.g.:<br>
2291 * isSimple("phylum")=true<br>
2292 * isSimple("subphylum")=false<br>
2293 * isSimple("no-rank")=false
2294 * @param levelExtended The extended level to test.
2295 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
2296 */
2297 public static boolean isSimple(int levelExtended){
2298 int level=extendedToLevel(levelExtended);
2299 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level));
2300 }
2301
2302 /**
2303 * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br>
2304 * isSimple("phylum")=true<br>
2305 * isSimple("substrain")=true<br>
2306 * isSimple("subphylum")=false<br>
2307 * isSimple("no-rank")=false
2308 * @param levelExtended The extended level to test.
2309 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
2310 */
2311 public static boolean isSimple2(int levelExtended){
2312 int level=extendedToLevel(levelExtended);
2313 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)
2314 || levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E);
2315 }
2316
2317 /*--------------------------------------------------------------*/
2318 /*---------------- Constants ----------------*/
2319 /*--------------------------------------------------------------*/
2320
2321 /** Get the number for the normal level of this name */
2322 public static final int stringToLevel(String s){return altLevelMap.get(s);}
2323 public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);}
2324 /** Get the number for the extended level of this name */
2325 public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);}
2326 /** Get the normal name for this normal level */
2327 public static final String levelToString(int x){return taxLevelNames[x];}
2328 /** Get the extended name for this extended level */
2329 public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];}
2330 /** Get the abbreviated name for this normal level */
2331 public static final String levelToStringShort(int x){return taxLevelNamesShort[x];}
2332
2333 /** Normal, aka canonical, aka simple tax level names */
2334 private static final String[] taxLevelNames=new String[] {
2335 "no rank", "subspecies", "species", "genus",
2336 "family", "order", "class", "phylum",
2337 "kingdom", "superkingdom", "domain", "life"
2338 };
2339 public static final int numTaxLevelNames=taxLevelNames.length;
2340
2341 /**
2342 * Definitive representation of all NCBI taxonomic level names.
2343 * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash.
2344 * The first dimension maps normal ranks to extended ranks.
2345 * Both dimensions are ordered ascending.
2346 * @TODO Note! If this goes over 63 names it will cause a problem with getDefinedLevels().
2347 */
2348 //TODO See @TODO
2349 private static final String[][] taxLevelNamesExtendedMatrix=new String[][] {
2350 {"no rank"},
2351 {"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup",
2352 "biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas",
2353 "subspecies"},
2354 {"species"},
2355 {"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"},
2356 {"subtribe", "tribe", "subfamily", "family"},
2357 {"superfamily", "parvorder", "infraorder", "suborder", "order"},
2358 {"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"},
2359 {"superclass", "subdivision", "division", "subphylum", "phylum"},
2360 {"superphylum", "subkingdom", "kingdom"},
2361 {"superkingdom"},
2362 {"domain"},
2363 {"life"}
2364 };
2365
2366 /** Extended tax level names as a 1D array */
2367 private static final String[] taxLevelNamesExtended=makeNamesExtended();
2368 /** Number of extended tax levels */
2369 public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length;
2370
2371 /** Flatten the extended tax level names matrix to a 1D array */
2372 private static final String[] makeNamesExtended(){
2373 ArrayList<String> list=new ArrayList<String>();
2374 for(String[] s : taxLevelNamesExtendedMatrix){
2375 for(String ss : s){
2376 list.add(ss);
2377 }
2378 }
2379 return list.toArray(new String[0]);
2380 }
2381
2382 /** Abbreviations of tax level names, mainly for semicolon form */
2383 private static final String[] taxLevelNamesShort=new String[] {
2384 "nr", "ss", "s", "g",
2385 "f", "o", "c", "p",
2386 "k", "sk", "d", "l"
2387 };
2388
2389 /** Normal tax level numbers as constants */
2390 public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3,
2391 FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11;
2392
2393 /** TaxID of Life node */
2394 public static final int LIFE_ID=1;
2395 /** TaxID of Cellular Organisms node */
2396 public static final int CELLULAR_ORGANISMS_ID=131567;
2397 /** TaxID of Bacteria node */
2398 public static final int BACTERIA_ID=2; //Is this safe? Who knows...
2399 /** TaxID of Archaea node */
2400 public static final int ARCHAEA_ID=2157;
2401 /** TaxID of Euk node */
2402 public static final int EUKARYOTA_ID=2759;
2403 /** TaxID of Animal node */
2404 public static final int METAZOA_ID=33208, ANIMALIA_ID=33208;
2405 /** TaxID of Plant node */
2406 public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090;
2407 /** TaxID of Fungi node */
2408 public static final int FUNGI_ID=4751;
2409 /** TaxID of Virus node */
2410 public static final int VIRUSES_ID=10239;
2411 /** TaxID of Viroids node (now defunct) */
2412 public static final int VIROIDS_ID=12884;
2413
2414 /** Maps normal level names to normal level numbers */
2415 private static final HashMap<String, Integer> levelMap=makeLevelMap();
2416 /** Maps extended level names to extended level numbers */
2417 private static final HashMap<String, Integer> levelMapExtended=makeLevelMapExtended();
2418 /** Maps extended level names to normal level numbers */
2419 private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap();
2420
2421 /** Common extended level numbers as constants */
2422 public static final int NO_RANK_E=NO_RANK,
2423 SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"),
2424 SUBSPECIES_E=stringToLevelExtended("subspecies"),
2425 SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"),
2426 FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"),
2427 CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"),
2428 KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"),
2429 DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life");
2430
2431 /** Map of normal to extended level numbers */
2432 private static final int[] levelToExtended=new int[] {
2433 NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E,
2434 ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E
2435 };
2436
2437 /** Map of extended to normal level numbers */
2438 private static final int[] extendedToLevel=makeExtendedToLevel();
2439
2440 /** Creates extendedToLevel from taxaNamesExtendedMatrix during initialization. */
2441 private static int[] makeExtendedToLevel(){
2442 int len=0;
2443 for(String[] array : taxLevelNamesExtendedMatrix){
2444 len+=array.length;
2445 }
2446 int[] ret=new int[len];
2447
2448 int pos=0;
2449 for(int level=0; level<taxLevelNamesExtendedMatrix.length; level++){
2450 String[] array=taxLevelNamesExtendedMatrix[level];
2451 for(int i=0; i<array.length; i++){
2452 ret[pos]=level;
2453 pos++;
2454 }
2455 }
2456 return ret;
2457 }
2458
2459 /** Convert a standard level number (like KINGDOM) to extended (like KINGDOM_E). */
2460 public static final int levelToExtended(int level){
2461 return level<0 ? level : levelToExtended[level];
2462 }
2463
2464 /** Convert an extended level number (like PHYLUM_E) to extended (like PHYLUM).
2465 * Non-standard levels will be converted to the next higher standard level;
2466 * e.g., subphylum -> phylum */
2467 public static final int extendedToLevel(int extended){
2468 return extended<0 ? -1 : extendedToLevel[extended];
2469 }
2470
2471 /* Pre-compiled delimiters to save time when splitting lines */
2472 private static final Pattern delimiterTab = Pattern.compile("\t");
2473 private static final Pattern delimiter = Pattern.compile("\t\\|\t");
2474 private static final Pattern delimiterPipe = Pattern.compile("\\|");
2475 private static final Pattern delimiterTilde = Pattern.compile("\\~");
2476 private static final Pattern delimiter2 = Pattern.compile("[\\s_]+");
2477
2478 public static boolean IMG_HQ=false;
2479
2480 /* For these fields, see the corresponding functions, below.
2481 * They define the default paths to various data on NERSC. */
2482
2483 private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest";
2484 private static final String defaultTaxPathAws="/test1/tax/latest";
2485 private static final String defaultTaxPathIGBVM="/data/tax/latest";
2486 private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
2487 private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
2488 private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
2489 private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
2490 private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz";
2491 private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
2492
2493 private static final String defaultImgFile="TAX_PATH/imgDump.txt";
2494 private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz";
2495 private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz";
2496 private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz";
2497 private static final String defaultPatternFile="TAX_PATH/patterns.txt";
2498 private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz";
2499
2500 private static final String defaultAccessionFile=
2501 //"TAX_PATH/shrunk.protF.accession2taxid.gz," +
2502 "TAX_PATH/shrunk.prot.accession2taxid.gz,"
2503 + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz,"
2504 + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz,"
2505 + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz,"
2506 // + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz,"
2507 + "TAX_PATH/shrunk.dead_wgs.accession2taxid.gz,"
2508 // + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz,"
2509 + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz,"
2510 + "TAX_PATH/shrunk.pdb.accession2taxid.gz";
2511
2512 /** For setting TAX_PATH, the root to taxonomy files */
2513 public static final String defaultTaxPath(){
2514 return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc;
2515 }
2516
2517 /** 16S consensus sequences per TaxID */
2518 public static final String default16SFile(){
2519 return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc;
2520 }
2521
2522 /** 18S consensus sequences per TaxID */
2523 public static final String default18SFile(){
2524 return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc;
2525 }
2526
2527 /** Path to all taxonomy files, substituted in to make specific file paths */
2528 public static String TAX_PATH=defaultTaxPath();
2529
2530 /** Location of gitable.int2d.gz for gi lookups */
2531 public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);}
2532 /** Location of tree.taxtree.gz */
2533 public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);}
2534
2535 //Use the prot.FULL.gz ncbi file.
2536 public static boolean protFull=false;
2537
2538 /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */
2539 public static final String defaultAccessionFile(){
2540 String s=(protFull ? "TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile;
2541 return s.replaceAll("TAX_PATH", TAX_PATH);
2542 }
2543 /** Location of patterns.txt, which holds information about observed accession string formats */
2544 public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);}
2545 /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */
2546 public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);}
2547 /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */
2548 public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);}
2549
2550 /** Screen output gets printed here */
2551 private static PrintStream outstream=System.out;
2552
2553 }