annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/TaxTree.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package tax;
jpayne@68 2
jpayne@68 3 import java.io.PrintStream;
jpayne@68 4 import java.io.Serializable;
jpayne@68 5 import java.util.ArrayList;
jpayne@68 6 import java.util.Arrays;
jpayne@68 7 import java.util.HashMap;
jpayne@68 8 import java.util.LinkedHashMap;
jpayne@68 9 import java.util.List;
jpayne@68 10 import java.util.regex.Pattern;
jpayne@68 11
jpayne@68 12 import fileIO.ByteFile;
jpayne@68 13 import fileIO.ReadWrite;
jpayne@68 14 import fileIO.TextFile;
jpayne@68 15 import shared.Parse;
jpayne@68 16 import shared.Parser;
jpayne@68 17 import shared.PreParser;
jpayne@68 18 import shared.Shared;
jpayne@68 19 import shared.Timer;
jpayne@68 20 import shared.Tools;
jpayne@68 21 import structures.ByteBuilder;
jpayne@68 22 import structures.IntHashMap;
jpayne@68 23 import structures.IntList;
jpayne@68 24 import structures.IntLongHashMap;
jpayne@68 25
jpayne@68 26 /**
jpayne@68 27 * Represents a taxonomic tree.
jpayne@68 28 * Usually just one of these needs to be created for a process.
jpayne@68 29 * Designed for NCBI's taxdmp.zip file contents.
jpayne@68 30 * @author Brian Bushnell
jpayne@68 31 * @date Mar 6, 2015
jpayne@68 32 *
jpayne@68 33 */
jpayne@68 34 public class TaxTree implements Serializable{
jpayne@68 35
jpayne@68 36 /**
jpayne@68 37 *
jpayne@68 38 */
jpayne@68 39 private static final long serialVersionUID = 5894416521711540017L;
jpayne@68 40
jpayne@68 41 /*--------------------------------------------------------------*/
jpayne@68 42 /*---------------- Main ----------------*/
jpayne@68 43 /*--------------------------------------------------------------*/
jpayne@68 44
jpayne@68 45 /**
jpayne@68 46 * Code entrance from the command line.
jpayne@68 47 * This is not called normally, only when converting NCBI text files
jpayne@68 48 * into a binary representation and writing it to disk.
jpayne@68 49 * @param args Command line arguments
jpayne@68 50 */
jpayne@68 51 public static void main(String[] args){
jpayne@68 52
jpayne@68 53 {//Preparse block for help, config files, and outstream
jpayne@68 54 PreParser pp=new PreParser(args, outstream, null, false);
jpayne@68 55 args=pp.args;
jpayne@68 56 outstream=pp.outstream;
jpayne@68 57 }
jpayne@68 58
jpayne@68 59 assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n";
jpayne@68 60 ReadWrite.USE_UNPIGZ=true;
jpayne@68 61 ReadWrite.USE_PIGZ=true;
jpayne@68 62 ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9);
jpayne@68 63 ReadWrite.PIGZ_BLOCKSIZE=256;
jpayne@68 64 ReadWrite.PIGZ_ITERATIONS=60;
jpayne@68 65
jpayne@68 66 Timer t=new Timer();
jpayne@68 67 TaxTree tree=new TaxTree(args);
jpayne@68 68 t.stop();
jpayne@68 69
jpayne@68 70 outstream.println("Retained "+tree.nodeCount+" nodes:");
jpayne@68 71
jpayne@68 72 for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){
jpayne@68 73 outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]);
jpayne@68 74 if(verbose){
jpayne@68 75 int lim=10;
jpayne@68 76 for(int j=0; j<lim && j<tree.treeLevelsExtended[i].length; j++){
jpayne@68 77 TaxNode n=tree.treeLevelsExtended[i][j];
jpayne@68 78 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]);
jpayne@68 79 }
jpayne@68 80 for(int j=tree.treeLevelsExtended[i].length-lim; j<tree.treeLevelsExtended[i].length; j++){
jpayne@68 81 if(j>=lim){
jpayne@68 82 TaxNode n=tree.treeLevelsExtended[i][j];
jpayne@68 83 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]);
jpayne@68 84 }
jpayne@68 85 }
jpayne@68 86 }
jpayne@68 87 outstream.println();
jpayne@68 88 }
jpayne@68 89
jpayne@68 90
jpayne@68 91 outstream.println();
jpayne@68 92 outstream.println("Time: \t"+t);
jpayne@68 93
jpayne@68 94 if(args.length>2){//Write a tree
jpayne@68 95 ReadWrite.write(tree, args[3], true);
jpayne@68 96 }
jpayne@68 97 }
jpayne@68 98
jpayne@68 99 /** Parse arguments from the command line */
jpayne@68 100 private Parser parse(String[] args){
jpayne@68 101
jpayne@68 102 //Create a parser object
jpayne@68 103 Parser parser=new Parser();
jpayne@68 104
jpayne@68 105 //Set any necessary Parser defaults here
jpayne@68 106 //parser.foo=bar;
jpayne@68 107
jpayne@68 108 //Parse each argument
jpayne@68 109 for(int i=0; i<args.length; i++){
jpayne@68 110 String arg=args[i];
jpayne@68 111
jpayne@68 112 //Break arguments into their constituent parts, in the form of "a=b"
jpayne@68 113 String[] split=arg.split("=");
jpayne@68 114 String a=split[0].toLowerCase();
jpayne@68 115 String b=split.length>1 ? split[1] : null;
jpayne@68 116 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
jpayne@68 117
jpayne@68 118 if(a.equals("verbose")){
jpayne@68 119 verbose=Parse.parseBoolean(b);
jpayne@68 120 }else if(a.equals("parse_flag_goes_here")){
jpayne@68 121 long fake_variable=Parse.parseKMG(b);
jpayne@68 122 //Set a variable here
jpayne@68 123 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
jpayne@68 124 //do nothing
jpayne@68 125 }else if(i>3){
jpayne@68 126 outstream.println("Unknown parameter "+args[i]);
jpayne@68 127 assert(false) : "Unknown parameter "+args[i];
jpayne@68 128 }
jpayne@68 129 }
jpayne@68 130
jpayne@68 131 return parser;
jpayne@68 132 }
jpayne@68 133
jpayne@68 134 /*--------------------------------------------------------------*/
jpayne@68 135 /*---------------- Initialization ----------------*/
jpayne@68 136 /*--------------------------------------------------------------*/
jpayne@68 137
jpayne@68 138 /*--------------------------------------------------------------*/
jpayne@68 139 /*---------------- Constructors ----------------*/
jpayne@68 140 /*--------------------------------------------------------------*/
jpayne@68 141
jpayne@68 142 /**
jpayne@68 143 * Constructor using filenames from command line arguments, in the format of:
jpayne@68 144 * {names, nodes, merged}
jpayne@68 145 * @param args Command line arguments
jpayne@68 146 */
jpayne@68 147 private TaxTree(String[] args){
jpayne@68 148 this(args[0], args[1], args[2], args);
jpayne@68 149 }
jpayne@68 150
jpayne@68 151 /**
jpayne@68 152 * @param namesFile NCBI names.txt
jpayne@68 153 * @param nodesFile NCBI nodes.txt
jpayne@68 154 * @param mergedFile NCBI merged.txt
jpayne@68 155 * @param args
jpayne@68 156 */
jpayne@68 157 private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){
jpayne@68 158
jpayne@68 159 if(args!=null) {
jpayne@68 160 Parser parser=parse(args);
jpayne@68 161 }
jpayne@68 162
jpayne@68 163 nodes=getNames(namesFile);
jpayne@68 164 getNodes(nodesFile, nodes);
jpayne@68 165
jpayne@68 166 mergedMap=getMerged(mergedFile);
jpayne@68 167
jpayne@68 168 countChildren();
jpayne@68 169 outstream.println("Counted children.");
jpayne@68 170 int rounds=percolate();
jpayne@68 171 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
jpayne@68 172
jpayne@68 173 if(assignStrains){
jpayne@68 174 assignStrains();
jpayne@68 175 rounds=percolate();
jpayne@68 176 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
jpayne@68 177 }
jpayne@68 178
jpayne@68 179 if(simplify){
jpayne@68 180 if(verbose){outstream.println("Simplifying.");}
jpayne@68 181 int removed=simplify(nodes);
jpayne@68 182 if(verbose){outstream.println("Removed "+removed+" nodes.");}
jpayne@68 183 rounds=percolate();
jpayne@68 184 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
jpayne@68 185 }
jpayne@68 186 int errors=test(nodes);
jpayne@68 187 // assert(errors==0); //Not possible since the tree is wrong.
jpayne@68 188 if(errors>0) {
jpayne@68 189 System.err.println("Found "+errors+" errors in tree.");
jpayne@68 190 }
jpayne@68 191
jpayne@68 192 for(TaxNode n : nodes){
jpayne@68 193 if(n!=null){
jpayne@68 194 nodesPerLevel[n.level]++;
jpayne@68 195 nodesPerLevelExtended[n.levelExtended]++;
jpayne@68 196 }
jpayne@68 197 }
jpayne@68 198
jpayne@68 199 // for(int i=0; i<nodesPerLevel.length; i++){
jpayne@68 200 // treeLevels[i]=new TaxNode[nodesPerLevel[i]];
jpayne@68 201 // }
jpayne@68 202 for(int i=0; i<nodesPerLevelExtended.length; i++){
jpayne@68 203 treeLevelsExtended[i]=new TaxNode[nodesPerLevelExtended[i]];
jpayne@68 204 }
jpayne@68 205
jpayne@68 206 // {
jpayne@68 207 // int[] temp=new int[nodesPerLevel.length];
jpayne@68 208 // for(TaxNode n : nodes){
jpayne@68 209 // if(n!=null){
jpayne@68 210 // int level=n.level;
jpayne@68 211 // treeLevels[level][temp[level]]=n;
jpayne@68 212 // temp[level]++;
jpayne@68 213 // }
jpayne@68 214 // }
jpayne@68 215 // }
jpayne@68 216
jpayne@68 217 {
jpayne@68 218 int[] temp=new int[nodesPerLevelExtended.length];
jpayne@68 219 for(TaxNode n : nodes){
jpayne@68 220 if(n!=null){
jpayne@68 221 int level=n.levelExtended;
jpayne@68 222 treeLevelsExtended[level][temp[level]]=n;
jpayne@68 223 temp[level]++;
jpayne@68 224 }
jpayne@68 225 }
jpayne@68 226 }
jpayne@68 227 nodeCount=(int)Tools.sum(nodesPerLevelExtended);
jpayne@68 228
jpayne@68 229 }
jpayne@68 230
jpayne@68 231 /*--------------------------------------------------------------*/
jpayne@68 232 /*--------------- Loaders ----------------*/
jpayne@68 233 /*--------------------------------------------------------------*/
jpayne@68 234
jpayne@68 235
jpayne@68 236 /**
jpayne@68 237 * Load a tax tree from disk.
jpayne@68 238 * @param taxTreeFile Serialized TaxTree.
jpayne@68 239 * @param hashNames Hash nodes using names as keys
jpayne@68 240 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens
jpayne@68 241 * @return
jpayne@68 242 */
jpayne@68 243 public static final TaxTree loadTaxTree(String taxTreeFile, PrintStream outstream, boolean hashNames,
jpayne@68 244 boolean hashDotFormat){
jpayne@68 245 if(taxTreeFile==null){return null;}
jpayne@68 246 return loadTaxTree(taxTreeFile, null, null, null, outstream, hashNames, hashDotFormat);
jpayne@68 247 }
jpayne@68 248
jpayne@68 249 /**
jpayne@68 250 * Load a tax tree from disk, either from a binary tree file,
jpayne@68 251 * or from NCBI text files.
jpayne@68 252 * @param taxTreeFile Binary representation; mutually exclusive with other files.
jpayne@68 253 * @param taxNameFile NCBI names.txt
jpayne@68 254 * @param taxNodeFile NCBI nodes.txt
jpayne@68 255 * @param taxMergedFile NCBI merged.txt
jpayne@68 256 * @param hashNames Hash nodes using names as keys
jpayne@68 257 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens
jpayne@68 258 * @return The loaded tree
jpayne@68 259 */
jpayne@68 260 public static final TaxTree loadTaxTree(String taxTreeFile, String taxNameFile, String taxNodeFile,
jpayne@68 261 String taxMergedFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){
jpayne@68 262 if(taxTreeFile!=null || taxNodeFile==null){
jpayne@68 263 TaxTree tree=sharedTree(taxTreeFile, hashNames, hashDotFormat, outstream);
jpayne@68 264 if(tree!=null){return tree;}
jpayne@68 265 }
jpayne@68 266 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=defaultTreeFile();}
jpayne@68 267 assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files.";
jpayne@68 268 Timer t=new Timer();
jpayne@68 269 if(outstream!=null){outstream.print("\nLoading tax tree; ");}
jpayne@68 270 final TaxTree tree;
jpayne@68 271 if(taxTreeFile!=null){
jpayne@68 272 tree=ReadWrite.read(TaxTree.class, taxTreeFile, true);
jpayne@68 273 }else{
jpayne@68 274 tree=new TaxTree(taxNameFile, taxNodeFile, taxMergedFile, null);
jpayne@68 275 }
jpayne@68 276 t.stop();
jpayne@68 277 if(hashNames){
jpayne@68 278 if(outstream!=null){outstream.println("Hashing names.");}
jpayne@68 279 tree.hashNames(hashDotFormat);
jpayne@68 280 }
jpayne@68 281 if(outstream!=null){
jpayne@68 282 outstream.println("Time: \t"+t);
jpayne@68 283 Shared.printMemory();
jpayne@68 284 outstream.println();
jpayne@68 285 }
jpayne@68 286 if(ALLOW_SHARED_TREE){sharedTree=tree;}
jpayne@68 287 return tree;
jpayne@68 288 }
jpayne@68 289
jpayne@68 290 /*--------------------------------------------------------------*/
jpayne@68 291 /*--------------- Constructor Helpers ----------------*/
jpayne@68 292 /*--------------------------------------------------------------*/
jpayne@68 293
jpayne@68 294 /**
jpayne@68 295 * Finds unranked nodes in the archaeal and bacterial kingdoms.
jpayne@68 296 * If these are below species level, have a ranked parent,
jpayne@68 297 * and have no ranked children, they are assigned strain or substrain.
jpayne@68 298 */
jpayne@68 299 private void assignStrains(){
jpayne@68 300
jpayne@68 301 outstream.println("Assigning strains.");
jpayne@68 302 int strains=0, substrains=0;
jpayne@68 303 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed yet
jpayne@68 304 TaxNode archaea=getNode(ARCHAEA_ID);
jpayne@68 305 assert(bacteria.name.equalsIgnoreCase("Bacteria"));
jpayne@68 306 assert(archaea.name.equalsIgnoreCase("Archaea"));
jpayne@68 307
jpayne@68 308 ArrayList<TaxNode> bactList=new ArrayList<TaxNode>();
jpayne@68 309 ArrayList<TaxNode> archList=new ArrayList<TaxNode>();
jpayne@68 310 for(TaxNode tn : nodes){
jpayne@68 311 if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){
jpayne@68 312 if(descendsFrom(tn, bacteria)){
jpayne@68 313 bactList.add(tn);
jpayne@68 314 }else if(descendsFrom(tn, archaea)){
jpayne@68 315 archList.add(tn);
jpayne@68 316 }
jpayne@68 317 }
jpayne@68 318 }
jpayne@68 319
jpayne@68 320 ArrayList<TaxNode> prokList=new ArrayList<TaxNode>(bactList.size()+archList.size());
jpayne@68 321 prokList.addAll(bactList);
jpayne@68 322 prokList.addAll(archList);
jpayne@68 323
jpayne@68 324 for(TaxNode tn : prokList){
jpayne@68 325 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){
jpayne@68 326 TaxNode parent=nodes[tn.pid];
jpayne@68 327 if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){
jpayne@68 328 tn.levelExtended=STRAIN_E;
jpayne@68 329 tn.level=SUBSPECIES;
jpayne@68 330 tn.setOriginalLevel(STRAIN_E);
jpayne@68 331 strains++;
jpayne@68 332 }
jpayne@68 333 }
jpayne@68 334 }
jpayne@68 335
jpayne@68 336 // outstream.println("Assigned "+strains+" strains.");
jpayne@68 337 for(TaxNode tn : prokList){
jpayne@68 338 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){
jpayne@68 339 TaxNode parent=nodes[tn.pid];
jpayne@68 340 if(parent.levelExtended==STRAIN_E){
jpayne@68 341 tn.levelExtended=SUBSTRAIN_E;
jpayne@68 342 tn.level=SUBSPECIES;
jpayne@68 343 tn.setOriginalLevel(SUBSTRAIN_E);
jpayne@68 344 substrains++;
jpayne@68 345 }
jpayne@68 346 }
jpayne@68 347 }
jpayne@68 348 // outstream.println("Assigned "+substrains+" substrains.");
jpayne@68 349 }
jpayne@68 350
jpayne@68 351 @Deprecated
jpayne@68 352 private void assignStrainsOld(){
jpayne@68 353
jpayne@68 354 outstream.println("Assigning strains.");
jpayne@68 355 int strains=0, substrains=0;
jpayne@68 356 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed
jpayne@68 357 assert(bacteria.name.equalsIgnoreCase("Bacteria"));
jpayne@68 358 for(TaxNode tn : nodes){
jpayne@68 359 if(tn!=null && tn.originalLevel()==NO_RANK){
jpayne@68 360 TaxNode parent=nodes[tn.pid];
jpayne@68 361 if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){
jpayne@68 362 // nodesPerLevelExtended[STRAIN_E]++;
jpayne@68 363 // nodesPerLevelExtended[tn.levelExtended]--;
jpayne@68 364 tn.levelExtended=STRAIN_E;
jpayne@68 365 tn.level=SUBSPECIES;
jpayne@68 366 tn.setOriginalLevel(STRAIN_E);
jpayne@68 367 strains++;
jpayne@68 368 }
jpayne@68 369 }
jpayne@68 370 }
jpayne@68 371 // outstream.println("Assigned "+strains+" strains.");
jpayne@68 372 for(TaxNode tn : nodes){
jpayne@68 373 if(tn!=null && tn.originalLevel()==NO_RANK){
jpayne@68 374 TaxNode parent=nodes[tn.pid];
jpayne@68 375 if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){
jpayne@68 376 // nodesPerLevelExtended[SUBSTRAIN_E]++;
jpayne@68 377 // nodesPerLevelExtended[tn.levelExtended]--;
jpayne@68 378 tn.levelExtended=SUBSTRAIN_E;
jpayne@68 379 tn.level=SUBSPECIES;
jpayne@68 380 tn.setOriginalLevel(SUBSTRAIN_E);
jpayne@68 381 substrains++;
jpayne@68 382 }
jpayne@68 383 }
jpayne@68 384 }
jpayne@68 385 // outstream.println("Assigned "+substrains+" substrains.");
jpayne@68 386 }
jpayne@68 387
jpayne@68 388 /*--------------------------------------------------------------*/
jpayne@68 389 /*---------------- Construction ----------------*/
jpayne@68 390 /*--------------------------------------------------------------*/
jpayne@68 391
jpayne@68 392 /**
jpayne@68 393 * Create tax nodes using names in the designated file.
jpayne@68 394 * @param fname NCBI names.txt
jpayne@68 395 * @return Array of created nodes, where array[x] contains the node with TaxID x.
jpayne@68 396 */
jpayne@68 397 private static TaxNode[] getNames(String fname){
jpayne@68 398 ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000);
jpayne@68 399 int max=0;
jpayne@68 400
jpayne@68 401 TextFile tf=new TextFile(fname, false);
jpayne@68 402 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
jpayne@68 403 if(s.contains("scientific name")){
jpayne@68 404 String[] split=delimiter.split(s, 3);
jpayne@68 405 assert(split.length==3) : s;
jpayne@68 406 int id=Integer.parseInt(split[0]);
jpayne@68 407 String name=split[1];
jpayne@68 408 if(id==1 && name.equalsIgnoreCase("root")){name="Life";}
jpayne@68 409 max=Tools.max(max, id);
jpayne@68 410 list.add(new TaxNode(id, name));
jpayne@68 411 }
jpayne@68 412 }
jpayne@68 413
jpayne@68 414 TaxNode[] nodes=new TaxNode[max+1];
jpayne@68 415 for(TaxNode n : list){
jpayne@68 416 assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n;
jpayne@68 417 nodes[n.id]=n;
jpayne@68 418 }
jpayne@68 419
jpayne@68 420 return nodes;
jpayne@68 421 }
jpayne@68 422
jpayne@68 423 /**
jpayne@68 424 * Parses names file a second time to fill in additional information.
jpayne@68 425 * Should really be merged into getNames.
jpayne@68 426 * @TODO Merge into getNames
jpayne@68 427 */
jpayne@68 428 private static TaxNode[] getNodes(String fname, TaxNode[] nodes){
jpayne@68 429
jpayne@68 430 int max=0;
jpayne@68 431
jpayne@68 432 LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>();
jpayne@68 433
jpayne@68 434 TextFile tf=new TextFile(fname, false);
jpayne@68 435 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
jpayne@68 436 String[] split=delimiter.split(s, 4);
jpayne@68 437 assert(split.length==4) : s;
jpayne@68 438 int id=-1, pid=-1, level=-1, levelExtended=-1;
jpayne@68 439
jpayne@68 440 id=Integer.parseInt(split[0]);
jpayne@68 441 try {
jpayne@68 442 pid=Integer.parseInt(split[1]);
jpayne@68 443 } catch (NumberFormatException e) {
jpayne@68 444 // TODO Auto-generated catch block
jpayne@68 445 e.printStackTrace();
jpayne@68 446 System.err.println("Bad line: "+s+"\n"+Arrays.toString(split));
jpayne@68 447 }
jpayne@68 448 boolean alt=false;
jpayne@68 449 {
jpayne@68 450 String key=split[2];
jpayne@68 451 Integer obj0=levelMap.get(key);
jpayne@68 452 Integer obj=levelMapExtended.get(key);
jpayne@68 453 assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split);
jpayne@68 454
jpayne@68 455 if(obj0==null){
jpayne@68 456 obj0=altLevelMap.get(key);
jpayne@68 457 alt=true;
jpayne@68 458 }
jpayne@68 459 if(obj0!=null){
jpayne@68 460 level=obj0;
jpayne@68 461 levelExtended=obj;
jpayne@68 462 if(id==pid){
jpayne@68 463 level=LIFE;
jpayne@68 464 levelExtended=LIFE_E;
jpayne@68 465 alt=false;
jpayne@68 466 }
jpayne@68 467 }else{
jpayne@68 468 if(id==pid){
jpayne@68 469 level=LIFE;
jpayne@68 470 levelExtended=LIFE_E;
jpayne@68 471 alt=false;
jpayne@68 472 }else{
jpayne@68 473 int[] count=oddNames.get(key);
jpayne@68 474 if(count==null){
jpayne@68 475 count=new int[1];
jpayne@68 476 oddNames.put(key, count);
jpayne@68 477 }
jpayne@68 478 count[0]++;
jpayne@68 479 }
jpayne@68 480 }
jpayne@68 481 }
jpayne@68 482 max=Tools.max(max, id);
jpayne@68 483 TaxNode n=nodes[id];
jpayne@68 484 assert(n!=null && n.pid<0) : n+" -> "+s;
jpayne@68 485 n.pid=pid;
jpayne@68 486 n.level=level;
jpayne@68 487 n.levelExtended=levelExtended;
jpayne@68 488 n.setOriginalLevel(levelExtended);
jpayne@68 489 n.setCanonical(!alt);
jpayne@68 490 assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n";
jpayne@68 491 }
jpayne@68 492
jpayne@68 493 if(oddNames.size()>0){
jpayne@68 494 outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:");
jpayne@68 495 if(verbose){
jpayne@68 496 for(String s : oddNames.keySet()){
jpayne@68 497 outstream.println(oddNames.get(s)[0]+"\t"+s);
jpayne@68 498 }
jpayne@68 499 }
jpayne@68 500 }
jpayne@68 501
jpayne@68 502 return nodes;
jpayne@68 503 }
jpayne@68 504
jpayne@68 505 /**
jpayne@68 506 * Count child nodes of each node.
jpayne@68 507 * This can be used to size arrays or determine which nodes are leaves.
jpayne@68 508 */
jpayne@68 509 private void countChildren(){
jpayne@68 510 for(TaxNode child : nodes){
jpayne@68 511 if(child!=null && child.pid!=child.id){
jpayne@68 512 TaxNode parent=getNode(child.pid);
jpayne@68 513 if(parent!=child){parent.numChildren++;}
jpayne@68 514 }
jpayne@68 515 }
jpayne@68 516 }
jpayne@68 517
jpayne@68 518 //TODO - This could be finished in 2 passes using the childTable.
jpayne@68 519 /**
jpayne@68 520 * Fill derived fields minParentLevelExtended and maxChildLevelExtended
jpayne@68 521 * by percolating information through the tree until a fixpoint is reached.
jpayne@68 522 * @TODO This could be finished in 2 passes using the childTable.
jpayne@68 523 * @return Number of rounds required to reach fixpoint.
jpayne@68 524 */
jpayne@68 525 private int percolate(){
jpayne@68 526 boolean changed=true;
jpayne@68 527 int rounds=0;
jpayne@68 528 while(changed){
jpayne@68 529 changed=false;
jpayne@68 530 rounds++;
jpayne@68 531 for(TaxNode child : nodes){
jpayne@68 532 if(child!=null && child.pid!=child.id){
jpayne@68 533 TaxNode parent=getNode(child.pid);
jpayne@68 534 changed=(child.discussWithParent(parent) | changed);
jpayne@68 535 }
jpayne@68 536 }
jpayne@68 537
jpayne@68 538 if(!changed){break;}
jpayne@68 539 changed=false;
jpayne@68 540 rounds++;
jpayne@68 541 for(int i=nodes.length-1; i>=0; i--){
jpayne@68 542 TaxNode child=nodes[i];
jpayne@68 543 if(child!=null && child.pid!=child.id){
jpayne@68 544 TaxNode parent=getNode(child.pid);
jpayne@68 545 changed=(child.discussWithParent(parent) | changed);
jpayne@68 546 }
jpayne@68 547 }
jpayne@68 548 }
jpayne@68 549 return rounds;
jpayne@68 550 }
jpayne@68 551
jpayne@68 552 /**
jpayne@68 553 * Load nodes into the nameMap and nameMapLower, mapped to their names.
jpayne@68 554 * @param genusDotSpecies Also hash abbreviations such as E.coli.
jpayne@68 555 */
jpayne@68 556 public synchronized void hashNames(boolean genusDotSpecies){
jpayne@68 557 if(nameMap!=null){return;}
jpayne@68 558 assert(nameMap==null);
jpayne@68 559 assert(nameMapLower==null);
jpayne@68 560 final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN));
jpayne@68 561 nameMap=new HashMap<String, ArrayList<TaxNode>>(size);
jpayne@68 562 nameMapLower=new HashMap<String, ArrayList<TaxNode>>(size);
jpayne@68 563
jpayne@68 564 //Hash the names, both lowercase and uppercase
jpayne@68 565 for(TaxNode n : nodes){
jpayne@68 566 if(n!=null){
jpayne@68 567 String name=n.name;
jpayne@68 568 if(name.indexOf('_')>=0){
jpayne@68 569 name=name.replace('_', ' ').trim();
jpayne@68 570 }
jpayne@68 571 if(name!=null && !name.equals("environmental samples")){
jpayne@68 572 {
jpayne@68 573 ArrayList<TaxNode> list=nameMap.get(name);
jpayne@68 574 if(list==null){
jpayne@68 575 list=new ArrayList<TaxNode>();
jpayne@68 576 nameMap.put(name, list);
jpayne@68 577 }
jpayne@68 578 list.add(n);
jpayne@68 579 }
jpayne@68 580 {
jpayne@68 581 String lc=name.toLowerCase();
jpayne@68 582 ArrayList<TaxNode> list=nameMapLower.get(lc);
jpayne@68 583 if(list==null){
jpayne@68 584 list=new ArrayList<TaxNode>();
jpayne@68 585 nameMapLower.put(lc, list);
jpayne@68 586 }
jpayne@68 587 list.add(n);
jpayne@68 588 }
jpayne@68 589 }
jpayne@68 590 }
jpayne@68 591 }
jpayne@68 592
jpayne@68 593 //Hash G.species versions of the names, both lowercase and uppercase
jpayne@68 594 if(genusDotSpecies){
jpayne@68 595 ByteBuilder bb=new ByteBuilder(64);
jpayne@68 596 for(TaxNode n : nodes){
jpayne@68 597 if(n!=null && n.levelExtended==SPECIES_E){
jpayne@68 598 String name=n.name;
jpayne@68 599 if(name.indexOf('_')>=0){
jpayne@68 600 name=name.replace('_', ' ').trim();
jpayne@68 601 }
jpayne@68 602 if(name!=null && !name.equals("environmental samples")){
jpayne@68 603 final String dotFormat=dotFormat(name, bb);
jpayne@68 604 if(dotFormat!=null){
jpayne@68 605 {
jpayne@68 606 ArrayList<TaxNode> list=nameMap.get(dotFormat);
jpayne@68 607 if(list==null){
jpayne@68 608 list=new ArrayList<TaxNode>();
jpayne@68 609 nameMap.put(dotFormat, list);
jpayne@68 610 }
jpayne@68 611 list.add(n);
jpayne@68 612 }
jpayne@68 613 {
jpayne@68 614 String lc=dotFormat.toLowerCase();
jpayne@68 615 ArrayList<TaxNode> list=nameMapLower.get(lc);
jpayne@68 616 if(list==null){
jpayne@68 617 list=new ArrayList<TaxNode>();
jpayne@68 618 nameMapLower.put(lc, list);
jpayne@68 619 }
jpayne@68 620 list.add(n);
jpayne@68 621 }
jpayne@68 622 }
jpayne@68 623 }
jpayne@68 624 }
jpayne@68 625 }
jpayne@68 626 }
jpayne@68 627 }
jpayne@68 628
jpayne@68 629 /**
jpayne@68 630 * Generate the "dot format" name of a node.
jpayne@68 631 * For example, transform "Homo sapiens" to "H.sapiens"
jpayne@68 632 * @param name Node name
jpayne@68 633 * @param buffer A ByteBuilder that may be modified
jpayne@68 634 * @return Dot format
jpayne@68 635 */
jpayne@68 636 private static String dotFormat(String name, ByteBuilder buffer){
jpayne@68 637 if(name==null || name.indexOf('.')>=0){return null;}
jpayne@68 638 final int firstSpace=name.indexOf(' ');
jpayne@68 639 if(firstSpace<0 || firstSpace>=name.length()-1){return null;}
jpayne@68 640 final int lastSpace=name.lastIndexOf(' ');
jpayne@68 641 if(firstSpace!=lastSpace){return null;}
jpayne@68 642 final String a=name.substring(0, firstSpace);
jpayne@68 643 final String b=name.substring(lastSpace+1);
jpayne@68 644 final char ca=a.charAt(0);
jpayne@68 645 final char cb=b.charAt(0);
jpayne@68 646 if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;}
jpayne@68 647 if(buffer==null){buffer=new ByteBuilder(2+b.length());}
jpayne@68 648 else{buffer.clear();}
jpayne@68 649 buffer.append(ca).append('.').append(b);
jpayne@68 650 return buffer.toString();
jpayne@68 651 }
jpayne@68 652
jpayne@68 653 /**
jpayne@68 654 * Fill childMap, which maps nodes to their children.
jpayne@68 655 */
jpayne@68 656 public synchronized void hashChildren(){
jpayne@68 657 assert(childMap==null);
jpayne@68 658 int nodesWithChildren=0;
jpayne@68 659 for(TaxNode tn : nodes){
jpayne@68 660 if(tn!=null && tn.numChildren>0){nodesWithChildren++;}
jpayne@68 661 }
jpayne@68 662 childMap=new HashMap<TaxNode, ArrayList<TaxNode>>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN));
jpayne@68 663 for(TaxNode tn : nodes){
jpayne@68 664 if(tn!=null){
jpayne@68 665 if(tn.numChildren>0){
jpayne@68 666 childMap.put(tn, new ArrayList<TaxNode>(tn.numChildren));
jpayne@68 667 }
jpayne@68 668 }
jpayne@68 669 }
jpayne@68 670 for(TaxNode tn : nodes){
jpayne@68 671 if(tn!=null){
jpayne@68 672 if(tn.id!=tn.pid){
jpayne@68 673 ArrayList<TaxNode> list=childMap.get(getNode(tn.pid));
jpayne@68 674 if(list!=null){list.add(tn);}
jpayne@68 675 }
jpayne@68 676 }
jpayne@68 677 }
jpayne@68 678 }
jpayne@68 679
jpayne@68 680 /**
jpayne@68 681 * Fetch this node's children.
jpayne@68 682 * @param parent Node in question
jpayne@68 683 * @return List of child nodes
jpayne@68 684 */
jpayne@68 685 public ArrayList<TaxNode> getChildren(TaxNode parent){
jpayne@68 686 if(parent.numChildren<1){return null;}
jpayne@68 687 if(childMap!=null){return childMap.get(parent);}
jpayne@68 688 ArrayList<TaxNode> list=new ArrayList<TaxNode>(parent.numChildren);
jpayne@68 689 for(TaxNode tn : nodes){
jpayne@68 690 if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){
jpayne@68 691 list.add(tn);
jpayne@68 692 }
jpayne@68 693 }
jpayne@68 694 return list;
jpayne@68 695 }
jpayne@68 696
jpayne@68 697 /**
jpayne@68 698 * Load a map of old to new TaxIDs.
jpayne@68 699 * @param mergedFile NCBI merged.txt.
jpayne@68 700 * @return Map of old to new TaxIDs
jpayne@68 701 */
jpayne@68 702 private static IntHashMap getMerged(String mergedFile) {
jpayne@68 703 if(mergedFile==null){return null;}
jpayne@68 704 String[] lines=TextFile.toStringLines(mergedFile);
jpayne@68 705 if(lines.length<1){return null;}
jpayne@68 706 IntHashMap map=new IntHashMap((int)(lines.length*1.3));
jpayne@68 707 for(String line : lines){
jpayne@68 708 String[] split=delimiterTab.split(line);
jpayne@68 709 int a=Integer.parseInt(split[0]);
jpayne@68 710 int b=Integer.parseInt(split[2]);
jpayne@68 711 map.put(a, b);
jpayne@68 712 }
jpayne@68 713 return map;
jpayne@68 714 }
jpayne@68 715
jpayne@68 716 /**
jpayne@68 717 * Simplify the tree by assigning ranks to unranked nodes,
jpayne@68 718 * where possible, through inference.
jpayne@68 719 * Optionally removes unranked nodes based on the skipNorank field.
jpayne@68 720 * @param nodes Array of all TaxNodes.
jpayne@68 721 * @return Number of nodes removed.
jpayne@68 722 */
jpayne@68 723 private int simplify(TaxNode nodes[]){
jpayne@68 724
jpayne@68 725 int failed=test(nodes);
jpayne@68 726
jpayne@68 727 int removed=0;
jpayne@68 728 int reassigned=0;
jpayne@68 729
jpayne@68 730 if(reassign){
jpayne@68 731 boolean changed=true;
jpayne@68 732 int changedCount=0;
jpayne@68 733 while(changed){
jpayne@68 734 changed=false;
jpayne@68 735 for(int i=0; i<nodes.length; i++){
jpayne@68 736 TaxNode n=nodes[i];
jpayne@68 737 if(n!=null && n.levelExtended<1){
jpayne@68 738 int pid=n.pid;
jpayne@68 739 TaxNode parent=nodes[pid];
jpayne@68 740 assert(parent!=null) : n;
jpayne@68 741 if(n.maxDescendantLevelIncludingSelf()<SUBSPECIES_E &&
jpayne@68 742 parent.minAncestorLevelIncludingSelf()<=SPECIES_E && parent.minAncestorLevelIncludingSelf()>=SUBSPECIES_E){
jpayne@68 743 // if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){
jpayne@68 744 changed=true;
jpayne@68 745 n.levelExtended=SUBSPECIES_E;
jpayne@68 746 n.level=SUBSPECIES;
jpayne@68 747 changedCount++;
jpayne@68 748 }
jpayne@68 749 }
jpayne@68 750 }
jpayne@68 751 }
jpayne@68 752 System.err.println("Assigned levels to "+changedCount+" unranked nodes.");
jpayne@68 753 }
jpayne@68 754
jpayne@68 755
jpayne@68 756 if(skipNorank){//Skip nodes with unknown taxa
jpayne@68 757 if(verbose){outstream.println("A0");}
jpayne@68 758
jpayne@68 759 for(int i=0; i<nodes.length; i++){
jpayne@68 760 TaxNode n=nodes[i];
jpayne@68 761 if(n!=null){
jpayne@68 762 int pid=n.pid;
jpayne@68 763 TaxNode parent=nodes[pid];
jpayne@68 764 assert(parent!=null) : n;
jpayne@68 765 assert(parent!=n || pid==1) : n+", "+pid;
jpayne@68 766 while(parent.levelExtended<1 && n.levelExtended>parent.levelExtended){
jpayne@68 767 //System.err.println("Reassigned from "+parent);
jpayne@68 768 assert(parent.id!=parent.pid);
jpayne@68 769 parent=nodes[parent.pid];
jpayne@68 770 n.pid=parent.id;
jpayne@68 771 reassigned++;
jpayne@68 772 }
jpayne@68 773 }
jpayne@68 774 }
jpayne@68 775
jpayne@68 776 for(int i=0; i<nodes.length; i++){
jpayne@68 777 if(nodes[i]!=null && nodes[i].levelExtended<0){
jpayne@68 778 System.err.println("Removed "+nodes[i]);
jpayne@68 779 nodes[i]=null;
jpayne@68 780 removed++;
jpayne@68 781 }
jpayne@68 782 }
jpayne@68 783 if(verbose){outstream.println("Skipped "+reassigned+" unranked parents, removed "+removed+" invalid nodes.");}
jpayne@68 784 }
jpayne@68 785
jpayne@68 786 if(inferRankLimit>0){//Infer level for unset nodes (from "no rank")
jpayne@68 787 if(verbose){outstream.println("A");}
jpayne@68 788 int changed=1;
jpayne@68 789 while(changed>0){
jpayne@68 790 changed=0;
jpayne@68 791 for(final TaxNode n : nodes){
jpayne@68 792 if(n!=null){
jpayne@68 793 if(n.levelExtended==0){
jpayne@68 794 TaxNode parent=nodes[n.pid];
jpayne@68 795 if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){
jpayne@68 796 n.levelExtended=Tools.max(1, parent.levelExtended-1);
jpayne@68 797 assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit);
jpayne@68 798 changed++;
jpayne@68 799 }
jpayne@68 800 }
jpayne@68 801 }
jpayne@68 802 }
jpayne@68 803 if(verbose){outstream.println("changed: "+changed);}
jpayne@68 804 }
jpayne@68 805
jpayne@68 806 // outstream.println("B");
jpayne@68 807 // for(TaxNode n : nodes){
jpayne@68 808 // if(n!=null && n.level==0){
jpayne@68 809 // n.level=-1;
jpayne@68 810 // }
jpayne@68 811 // }
jpayne@68 812 }
jpayne@68 813
jpayne@68 814 failed=test(nodes);
jpayne@68 815
jpayne@68 816 // if(reassign){//Skip nodes with duplicate taxa
jpayne@68 817 // if(verbose){outstream.println("D");}
jpayne@68 818 // int changed=1;
jpayne@68 819 // while(changed>0){
jpayne@68 820 // changed=0;
jpayne@68 821 // for(final TaxNode n : nodes){
jpayne@68 822 // if(n!=null){
jpayne@68 823 // TaxNode parent=nodes[n.pid];
jpayne@68 824 // TaxNode grandparent=nodes[parent.pid];
jpayne@68 825 // assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent;
jpayne@68 826 // assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent;
jpayne@68 827 //
jpayne@68 828 // while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) ||
jpayne@68 829 // n.level>parent.level || (n.level==parent.level))){
jpayne@68 830 // parent=grandparent;
jpayne@68 831 // grandparent=nodes[parent.pid];
jpayne@68 832 // n.pid=parent.id;
jpayne@68 833 // reassigned++;
jpayne@68 834 // changed++;
jpayne@68 835 // }
jpayne@68 836 // }
jpayne@68 837 // }
jpayne@68 838 // if(verbose){outstream.println("changed: "+changed);}
jpayne@68 839 // }
jpayne@68 840 // if(verbose){outstream.println("E");}
jpayne@68 841 // for(int i=0; i<nodes.length; i++){
jpayne@68 842 // if(nodes[i]!=null && nodes[i].level<0){
jpayne@68 843 // nodes[i]=null;
jpayne@68 844 // removed++;
jpayne@68 845 // }
jpayne@68 846 // }
jpayne@68 847 // }
jpayne@68 848
jpayne@68 849 failed=test(nodes);
jpayne@68 850
jpayne@68 851 if(verbose){outstream.println("F");}
jpayne@68 852 {//Ensure the tree is now clean
jpayne@68 853 for(int i=0; i<nodes.length; i++){
jpayne@68 854 TaxNode n=nodes[i];
jpayne@68 855 if(n!=null){
jpayne@68 856 TaxNode parent=nodes[n.pid];
jpayne@68 857 TaxNode grandparent=nodes[parent.pid];
jpayne@68 858 assert(n==parent || n.levelExtended<=parent.levelExtended || !n.canonical() || n.levelExtended<1 || parent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent;
jpayne@68 859 assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent;
jpayne@68 860 }
jpayne@68 861 }
jpayne@68 862 }
jpayne@68 863
jpayne@68 864 // if(verbose){System.err.println("Reassignments: "+reassigned);}
jpayne@68 865
jpayne@68 866 return removed;
jpayne@68 867 }
jpayne@68 868
jpayne@68 869 /*--------------------------------------------------------------*/
jpayne@68 870 /*---------------- Validation ----------------*/
jpayne@68 871 /*--------------------------------------------------------------*/
jpayne@68 872
jpayne@68 873 /**
jpayne@68 874 * Ensure tree has monotonically increasing (or nondescending) ranks.
jpayne@68 875 * @param nodes All TaxNodes.
jpayne@68 876 * @return Number of violations.
jpayne@68 877 */
jpayne@68 878 private static int test(TaxNode[] nodes){
jpayne@68 879 int failed=0;
jpayne@68 880 for(final TaxNode n : nodes){
jpayne@68 881 if(n!=null){
jpayne@68 882 TaxNode parent=nodes[n.pid];
jpayne@68 883 try {
jpayne@68 884 assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) :
jpayne@68 885 "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n"
jpayne@68 886 + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended;
jpayne@68 887 assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent;
jpayne@68 888 // assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical() || !parent.canonical()) : n+" -> "+parent;
jpayne@68 889 if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){
jpayne@68 890 if(verbose){outstream.println("Error: "+n+" -> "+parent);}
jpayne@68 891 failed++;
jpayne@68 892 }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){
jpayne@68 893 // if(verbose){outstream.println("Error: "+n+" -> "+parent);}
jpayne@68 894 // failed++;
jpayne@68 895 }
jpayne@68 896 assert(n!=parent || n.id<=1) : n;
jpayne@68 897 } catch (Throwable e) {
jpayne@68 898 // TODO Auto-generated catch block
jpayne@68 899 e.printStackTrace();
jpayne@68 900 failed++;
jpayne@68 901 }
jpayne@68 902 }
jpayne@68 903 }
jpayne@68 904 if(verbose || failed>0){outstream.println(failed+" nodes failed.");}
jpayne@68 905 return failed;
jpayne@68 906 }
jpayne@68 907
jpayne@68 908 /*--------------------------------------------------------------*/
jpayne@68 909 /*---------------- Print Methods ----------------*/
jpayne@68 910 /*--------------------------------------------------------------*/
jpayne@68 911
jpayne@68 912 /**
jpayne@68 913 * Format full name in semicolon format, e.g.
jpayne@68 914 * "SK:Bacteria;P:Protobacteria;..."
jpayne@68 915 * @param tn0 Base node
jpayne@68 916 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
jpayne@68 917 * @return Resultant String
jpayne@68 918 */
jpayne@68 919 public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){
jpayne@68 920 StringBuilder sb=new StringBuilder();
jpayne@68 921 if(tn0==null){return "Not found";}
jpayne@68 922 String semi="";
jpayne@68 923 ArrayList<TaxNode> list=toAncestors(tn0, skipNonCanonical);
jpayne@68 924 boolean addTaxLevel=true;
jpayne@68 925 for(int i=list.size()-1; i>=0; i--){
jpayne@68 926 sb.append(semi);
jpayne@68 927 TaxNode tn=list.get(i);
jpayne@68 928 if(tn.id!=LIFE_ID || list.size()==1){
jpayne@68 929 if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){
jpayne@68 930 sb.append(tn.levelToStringShort()).append(':');
jpayne@68 931 }
jpayne@68 932 sb.append(mononomial ? mononomial(tn) : tn.name);
jpayne@68 933 semi=";";
jpayne@68 934 }
jpayne@68 935 }
jpayne@68 936 return sb.toString();
jpayne@68 937 }
jpayne@68 938
jpayne@68 939 /**
jpayne@68 940 * Return a list of TaxIDs of all ancestors.
jpayne@68 941 * @param tn0 Base node
jpayne@68 942 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
jpayne@68 943 * @return List of TaxIDs.
jpayne@68 944 */
jpayne@68 945 public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){
jpayne@68 946 if(tn0==null){return null;}
jpayne@68 947 IntList list=new IntList(8);
jpayne@68 948
jpayne@68 949 TaxNode tn=tn0;
jpayne@68 950 while(tn!=null){
jpayne@68 951 if(!skipNonCanonical || tn.isSimple()){
jpayne@68 952 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);}
jpayne@68 953 }
jpayne@68 954 if(tn.pid==tn.id){break;}
jpayne@68 955 tn=getNode(tn.pid);
jpayne@68 956 }
jpayne@68 957 if(list.isEmpty()){list.add(tn0.id);}
jpayne@68 958 return list;
jpayne@68 959 }
jpayne@68 960
jpayne@68 961 /**
jpayne@68 962 * Return a list of all ancestors.
jpayne@68 963 * @param tn0 Base node
jpayne@68 964 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
jpayne@68 965 * @return List of ancestor nodes.
jpayne@68 966 */
jpayne@68 967 public ArrayList<TaxNode> toAncestors(final TaxNode tn0, boolean skipNonCanonical){
jpayne@68 968 if(tn0==null){return null;}
jpayne@68 969 ArrayList<TaxNode> list=new ArrayList<TaxNode>(8);
jpayne@68 970
jpayne@68 971 TaxNode tn=tn0;
jpayne@68 972 while(tn!=null){
jpayne@68 973 if(!skipNonCanonical || tn.isSimple()){
jpayne@68 974 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);}
jpayne@68 975 }
jpayne@68 976 if(tn.pid==tn.id){break;}
jpayne@68 977 tn=getNode(tn.pid);
jpayne@68 978 }
jpayne@68 979 if(list.isEmpty()){list.add(tn0);}
jpayne@68 980 return list;
jpayne@68 981 }
jpayne@68 982
jpayne@68 983 /**
jpayne@68 984 * Generate a path to the genome of an organism on the filesystem;
jpayne@68 985 * used by ExplodeTree. Intended for internal JGI use.
jpayne@68 986 * @param root Location of the exploded tree.
jpayne@68 987 * @return Path to a genome.
jpayne@68 988 */
jpayne@68 989 public String toDir(TaxNode node, String root){
jpayne@68 990 StringBuilder sb=new StringBuilder();
jpayne@68 991 if(root==null){root="";}
jpayne@68 992 sb.append(root);
jpayne@68 993 if(root.length()>0 && !root.endsWith("/")){sb.append('/');}
jpayne@68 994 IntList list=toAncestorIds(node, false);
jpayne@68 995 list.reverse();
jpayne@68 996 assert(list.get(0)==1) : list + "," +getNode(list.get(0));
jpayne@68 997 for(int i=0; i<list.size(); i++){
jpayne@68 998 sb.append(list.get(i));
jpayne@68 999 sb.append('/');
jpayne@68 1000 }
jpayne@68 1001 return sb.toString();
jpayne@68 1002 }
jpayne@68 1003
jpayne@68 1004 /*--------------------------------------------------------------*/
jpayne@68 1005 /*---------------- Outer Methods ----------------*/
jpayne@68 1006 /*--------------------------------------------------------------*/
jpayne@68 1007
jpayne@68 1008 /**
jpayne@68 1009 * Use various techniques to get a TaxID from an unknown String, such as parsing,
jpayne@68 1010 * name lookups, and accession translation.
jpayne@68 1011 * @param s String to process.
jpayne@68 1012 * @return Decoded TaxID.
jpayne@68 1013 */
jpayne@68 1014 public static int getID(String s){return GiToTaxid.getID(s);}
jpayne@68 1015
jpayne@68 1016 /**
jpayne@68 1017 * Use various techniques to get a TaxID from an unknown byte[], such as parsing,
jpayne@68 1018 * name lookups, and accession translation.
jpayne@68 1019 * @param s String to process.
jpayne@68 1020 * @return Decoded TaxID.
jpayne@68 1021 */
jpayne@68 1022 public static int getID(byte[] s){return GiToTaxid.getID(s);}
jpayne@68 1023
jpayne@68 1024 /** Return the lowest ancestor of the named node with taxonomic level at least minLevel */
jpayne@68 1025 public TaxNode getNode(String s, int minLevelExtended){
jpayne@68 1026 TaxNode tn=parseNodeFromHeader(s, true);
jpayne@68 1027 while(tn!=null && tn.levelExtended<minLevelExtended && tn.pid!=tn.id){
jpayne@68 1028 tn=getNode(tn.pid);
jpayne@68 1029 }
jpayne@68 1030 return tn;
jpayne@68 1031 }
jpayne@68 1032
jpayne@68 1033 /**
jpayne@68 1034 * Determine whether a node is a descendant of another.
jpayne@68 1035 * @param child Possible child TaxID.
jpayne@68 1036 * @param parent Possible parent TaxID.
jpayne@68 1037 * @return true iff child descends from parent.
jpayne@68 1038 */
jpayne@68 1039 public boolean descendsFrom(final int child, final int parent){
jpayne@68 1040 TaxNode cn=getNode(child), pn=getNode(parent);
jpayne@68 1041 assert(cn!=null) : "Invalid taxID: "+child;
jpayne@68 1042 assert(pn!=null) : "Invalid taxID: "+parent;
jpayne@68 1043 return descendsFrom(cn, pn);
jpayne@68 1044 }
jpayne@68 1045
jpayne@68 1046 /**
jpayne@68 1047 * Determine whether a node is a descendant of another.
jpayne@68 1048 * @param child Possible child node.
jpayne@68 1049 * @param parent Possible parent node.
jpayne@68 1050 * @return true iff child descends from parent.
jpayne@68 1051 */
jpayne@68 1052 public boolean descendsFrom(TaxNode child, TaxNode parent){
jpayne@68 1053 assert(child!=null && parent!=null) : "Null parameters.";
jpayne@68 1054 if(child==null || parent==null){return false;}
jpayne@68 1055
jpayne@68 1056 while(child!=parent && child.levelExtended<=parent.levelExtended && child.id!=child.pid){
jpayne@68 1057 child=getNode(child.pid);
jpayne@68 1058 }
jpayne@68 1059 return child==parent;
jpayne@68 1060 }
jpayne@68 1061
jpayne@68 1062 /**
jpayne@68 1063 * Determine whether an organism is classified as X.
jpayne@68 1064 * @param taxID taxID of organism.
jpayne@68 1065 * @param ancestorID taxID of possible ancestor.
jpayne@68 1066 * @return true if the organism is an X.
jpayne@68 1067 */
jpayne@68 1068 public boolean descendsFrom2(int taxID, final int ancestorID) {
jpayne@68 1069 TaxNode tn=getNode(taxID);
jpayne@68 1070 while(tn.id!=tn.pid){
jpayne@68 1071 if(tn.id==ancestorID){return true;}
jpayne@68 1072 tn=getNode(tn.pid);
jpayne@68 1073 }
jpayne@68 1074 return false;
jpayne@68 1075 }
jpayne@68 1076
jpayne@68 1077 /** Determine whether an organism is classified as a plant. */
jpayne@68 1078 public boolean isPlant(int taxID) {return descendsFrom2(taxID, VIRIDIPLANTAE_ID);}
jpayne@68 1079
jpayne@68 1080 /** Determine whether an organism is classified as an animal. */
jpayne@68 1081 public boolean isAnimal(int taxID) {return descendsFrom2(taxID, METAZOA_ID);}
jpayne@68 1082
jpayne@68 1083 /** Determine whether an organism is classified as a fungus. */
jpayne@68 1084 public boolean isFungus(int taxID) {return descendsFrom2(taxID, FUNGI_ID);}
jpayne@68 1085
jpayne@68 1086 /** Determine whether an organism is classified as a eukaryote. */
jpayne@68 1087 public boolean isEukaryote(int taxID) {return descendsFrom2(taxID, EUKARYOTA_ID);}
jpayne@68 1088
jpayne@68 1089 /** Determine whether an organism is classified as a prokaryote. */
jpayne@68 1090 public boolean isProkaryote(int taxID) {
jpayne@68 1091 TaxNode tn=getNode(taxID);
jpayne@68 1092 if(tn==null){
jpayne@68 1093 System.err.println("*** Warning: Can't find node "+taxID+" ***");
jpayne@68 1094 return false;
jpayne@68 1095 }
jpayne@68 1096 while(tn.id!=tn.pid){
jpayne@68 1097 if(tn.id==BACTERIA_ID || tn.id==ARCHAEA_ID){return true;}
jpayne@68 1098 tn=getNode(tn.pid);
jpayne@68 1099 }
jpayne@68 1100 return false;
jpayne@68 1101 }
jpayne@68 1102
jpayne@68 1103 /**
jpayne@68 1104 * Calculate the common ancestor of two nodes.
jpayne@68 1105 * @param a TaxID of a node.
jpayne@68 1106 * @param b TaxID of a node.
jpayne@68 1107 * @return Common ancestor ID of a and b.
jpayne@68 1108 */
jpayne@68 1109 public int commonAncestor(final int a, final int b){
jpayne@68 1110 TaxNode an=getNode(a), bn=getNode(b);
jpayne@68 1111 assert(an!=null) : "Invalid taxID: "+a;
jpayne@68 1112 assert(bn!=null) : "Invalid taxID: "+b;
jpayne@68 1113 TaxNode cn=commonAncestor(an, bn);
jpayne@68 1114 assert(cn!=null) : "No common ancestor: "+an+", "+bn;
jpayne@68 1115 if(cn==null){return -1;}
jpayne@68 1116 return cn.id;
jpayne@68 1117 }
jpayne@68 1118
jpayne@68 1119 /**
jpayne@68 1120 * Calculate the common ancestor of two nodes.
jpayne@68 1121 * @param a A node.
jpayne@68 1122 * @param b A node.
jpayne@68 1123 * @return Common ancestor of a and b.
jpayne@68 1124 */
jpayne@68 1125 public TaxNode commonAncestor(TaxNode a, TaxNode b){
jpayne@68 1126 assert(a!=null && b!=null) : "Null parameters.";
jpayne@68 1127 if(a==null){return b;}
jpayne@68 1128 if(b==null){return a;}
jpayne@68 1129
jpayne@68 1130 while(a!=b){
jpayne@68 1131 if(a.levelExtended<b.levelExtended){
jpayne@68 1132 a=getNode(a.pid);
jpayne@68 1133 }else{
jpayne@68 1134 b=getNode(b.pid);
jpayne@68 1135 }
jpayne@68 1136 }
jpayne@68 1137 return a;
jpayne@68 1138 }
jpayne@68 1139
jpayne@68 1140 /**
jpayne@68 1141 * Identify the highest ancestor of a node;
jpayne@68 1142 * this will presumably be "Life".
jpayne@68 1143 * @param a Node
jpayne@68 1144 * @return Highest ancestor
jpayne@68 1145 */
jpayne@68 1146 public TaxNode highestAncestor(TaxNode a){
jpayne@68 1147 assert(a!=null);
jpayne@68 1148 while(a.id!=a.pid){a=getNode(a.pid);}
jpayne@68 1149 return a;
jpayne@68 1150 }
jpayne@68 1151
jpayne@68 1152 /*--------------------------------------------------------------*/
jpayne@68 1153 /*---------------- Header Parsing ----------------*/
jpayne@68 1154 /*--------------------------------------------------------------*/
jpayne@68 1155
jpayne@68 1156 /**
jpayne@68 1157 * Determine the TaxID of a String,
jpayne@68 1158 * without a loaded TaxTree.
jpayne@68 1159 * This only works if the literal TaxID is embedded in the String.
jpayne@68 1160 * @param header Typically a sequence header
jpayne@68 1161 * @return Decoded TaxID, or -1 if unsuccessful
jpayne@68 1162 */
jpayne@68 1163 public static int parseHeaderStatic(String header){
jpayne@68 1164 if(header.length()<3){return -1;}
jpayne@68 1165 if(header.charAt(0)=='>'){header=header.substring(1);}
jpayne@68 1166 if(!header.startsWith("tid|")){return -1;}
jpayne@68 1167 int idx=3;
jpayne@68 1168 int idx2=header.indexOf('|', 4);
jpayne@68 1169 if(idx2<5){return -1;}
jpayne@68 1170 int id=-1;
jpayne@68 1171 try {
jpayne@68 1172 id=Parse.parseInt(header, idx+1, idx2);
jpayne@68 1173 // System.err.println("d"+", "+header.substring(idx+1, idx2));
jpayne@68 1174 } catch (Throwable e) {
jpayne@68 1175 // System.err.println("e"+", "+header.substring(idx+1, idx2));
jpayne@68 1176 //ignore
jpayne@68 1177 }
jpayne@68 1178 return id;
jpayne@68 1179 }
jpayne@68 1180
jpayne@68 1181 /**
jpayne@68 1182 * Determine the TaxID of a String.
jpayne@68 1183 * @param header Typically a sequence header
jpayne@68 1184 * @param bestEffort In some cases, try certain substrings if the name is not found.
jpayne@68 1185 * @return
jpayne@68 1186 */
jpayne@68 1187 public TaxNode parseNodeFromHeader(String header, boolean bestEffort){
jpayne@68 1188 if(header==null || header.length()<2){return null;}
jpayne@68 1189 if(header.charAt(0)=='>'){header=header.substring(1);}
jpayne@68 1190 TaxNode tn;
jpayne@68 1191 if(SILVA_MODE){
jpayne@68 1192 tn=getNodeSilva(header, bestEffort);
jpayne@68 1193 }else if(UNITE_MODE){
jpayne@68 1194 tn=getNodeUnite(header, bestEffort);
jpayne@68 1195 }else{
jpayne@68 1196 final char delimiter=ncbiHeaderDelimiter(header);
jpayne@68 1197 if(delimiter==' '){
jpayne@68 1198 tn=getNodeNewStyle(header);
jpayne@68 1199 }else{
jpayne@68 1200 tn=getNodeOldStyle(header, delimiter);
jpayne@68 1201 if(tn==null && delimiter=='|'){
jpayne@68 1202 // System.err.println("A: "+header);
jpayne@68 1203 int id=-1;
jpayne@68 1204 String[] split=header.split("\\|");
jpayne@68 1205 if(AccessionToTaxid.LOADED()){
jpayne@68 1206 for(int i=0; i<split.length && id<0; i++){//Try accessions first
jpayne@68 1207 if(AccessionToTaxid.isValidAccession(split[i])){
jpayne@68 1208 id=AccessionToTaxid.get(split[i]);
jpayne@68 1209 }
jpayne@68 1210 }
jpayne@68 1211 }
jpayne@68 1212 for(int i=0; i<split.length && id<0; i++){//Then names
jpayne@68 1213 id=parseNameToTaxid(split[i]);
jpayne@68 1214 }
jpayne@68 1215 // System.err.println("E: "+id);
jpayne@68 1216 if(id>=0){tn=getNode(id);}
jpayne@68 1217 // System.err.println("F: "+tn);
jpayne@68 1218 }
jpayne@68 1219 }
jpayne@68 1220 }
jpayne@68 1221 return tn;
jpayne@68 1222 }
jpayne@68 1223
jpayne@68 1224 /**
jpayne@68 1225 * Guess the delimiter character in a String;
jpayne@68 1226 * typically assumed to be '|', '~', or ' '.
jpayne@68 1227 */
jpayne@68 1228 public static char ncbiHeaderDelimiter(String header){
jpayne@68 1229 for(int i=0; i<header.length(); i++){
jpayne@68 1230 final char c=header.charAt(i);
jpayne@68 1231 if(c=='|' || c=='~'){
jpayne@68 1232 assert(i>0) : "i="+i+"; malformatted header '"+header+"'";
jpayne@68 1233 return c;
jpayne@68 1234 }else if(Character.isWhitespace(c)){
jpayne@68 1235 return ' ';
jpayne@68 1236 }
jpayne@68 1237 }
jpayne@68 1238 return ' ';
jpayne@68 1239 }
jpayne@68 1240
jpayne@68 1241 /**
jpayne@68 1242 * Parse a Silva header to a Node.
jpayne@68 1243 * @param s Silva header.
jpayne@68 1244 * @param bestEffort Try certain substrings if the name is not found.
jpayne@68 1245 * @return Node
jpayne@68 1246 */
jpayne@68 1247 TaxNode getNodeSilva(String s, boolean bestEffort){
jpayne@68 1248 if(s==null){return null;}
jpayne@68 1249 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){
jpayne@68 1250 return getNodeOldStyle(s, s.charAt(3));
jpayne@68 1251 }
jpayne@68 1252 String[] split=Tools.semiPattern.split(s);
jpayne@68 1253
jpayne@68 1254 int number=-1;
jpayne@68 1255 // final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast"));
jpayne@68 1256 // if(chloroplast){return null;}
jpayne@68 1257 for(int i=split.length-1; number<0 && i>=0; i--){
jpayne@68 1258 String last=split[i];
jpayne@68 1259 int paren=last.indexOf('(');
jpayne@68 1260 if(paren>=0){last=last.substring(0, paren);}
jpayne@68 1261 last=last.trim();
jpayne@68 1262
jpayne@68 1263 if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){
jpayne@68 1264 number=parseNameToTaxid(last);
jpayne@68 1265 }
jpayne@68 1266
jpayne@68 1267 if(number>=0){return getNode(number);}
jpayne@68 1268 else if(!bestEffort){break;}
jpayne@68 1269 }
jpayne@68 1270 return null;
jpayne@68 1271 }
jpayne@68 1272
jpayne@68 1273 /**
jpayne@68 1274 * Parse a Unite header to a Node.
jpayne@68 1275 * @param s Unite header.
jpayne@68 1276 * @param bestEffort Try certain substrings if the name is not found.
jpayne@68 1277 * @return Node
jpayne@68 1278 */
jpayne@68 1279 TaxNode getNodeUnite(String s, boolean bestEffort){
jpayne@68 1280 if(s==null){return null;}
jpayne@68 1281 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){
jpayne@68 1282 return getNodeOldStyle(s, s.charAt(3));
jpayne@68 1283 }
jpayne@68 1284 String[] split=Tools.pipePattern.split(s);
jpayne@68 1285
jpayne@68 1286 int number=-1;
jpayne@68 1287 String name=split[0];
jpayne@68 1288 String acc=split[1];
jpayne@68 1289 if(AccessionToTaxid.LOADED() && acc.length()>0){
jpayne@68 1290 number=AccessionToTaxid.get(acc);
jpayne@68 1291 }
jpayne@68 1292 if(number<1){
jpayne@68 1293 TaxNode tn=getNodeByName(name);
jpayne@68 1294 if(tn!=null){number=tn.id;}
jpayne@68 1295 }
jpayne@68 1296
jpayne@68 1297 if(number>=0){return getNode(number);}
jpayne@68 1298 return null;
jpayne@68 1299 }
jpayne@68 1300
jpayne@68 1301 /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */
jpayne@68 1302 private TaxNode getNodeOldStyle(final String s, char delimiter){
jpayne@68 1303 {
jpayne@68 1304 int index=s.indexOf(delimiter);
jpayne@68 1305 if(index<0){
jpayne@68 1306 delimiter='~';
jpayne@68 1307 index=s.indexOf(delimiter);
jpayne@68 1308 if(index<0){
jpayne@68 1309 delimiter='_';
jpayne@68 1310 index=s.indexOf(delimiter);
jpayne@68 1311 }
jpayne@68 1312 }
jpayne@68 1313 int number=-1;
jpayne@68 1314
jpayne@68 1315 Throwable e=null;
jpayne@68 1316
jpayne@68 1317 if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){
jpayne@68 1318 // System.err.println("Parsing gi number.");
jpayne@68 1319
jpayne@68 1320 if(GiToTaxid.isInitialized()){
jpayne@68 1321 try {
jpayne@68 1322 number=GiToTaxid.parseGiToTaxid(s, delimiter);
jpayne@68 1323 } catch (Throwable e2) {
jpayne@68 1324 e=e2;
jpayne@68 1325 }
jpayne@68 1326 }else{
jpayne@68 1327 assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s;
jpayne@68 1328 }
jpayne@68 1329 // if(number!=-1){System.err.println("number="+number);}
jpayne@68 1330 }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){
jpayne@68 1331 // System.err.println("Parsing ncbi number.");
jpayne@68 1332 number=GiToTaxid.parseTaxidNumber(s, delimiter);
jpayne@68 1333 }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){
jpayne@68 1334 // System.err.println("Parsing ncbi number.");
jpayne@68 1335 long img=parseDelimitedNumber(s, delimiter);
jpayne@68 1336 ImgRecord record=imgMap.get(img);
jpayne@68 1337 number=(record==null ? -1 : record.taxID);
jpayne@68 1338 }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete
jpayne@68 1339 // System.err.println("Parsing ncbi number.");
jpayne@68 1340 number=GiToTaxid.parseTaxidNumber(s, delimiter);
jpayne@68 1341 }
jpayne@68 1342
jpayne@68 1343 if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){
jpayne@68 1344 String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s));
jpayne@68 1345 if(AccessionToTaxid.LOADED()){
jpayne@68 1346 number=parseAccessionToTaxid(split);
jpayne@68 1347 }
jpayne@68 1348 if(number<0){
jpayne@68 1349 number=parseHeaderNameToTaxid(split);
jpayne@68 1350 }
jpayne@68 1351 }
jpayne@68 1352
jpayne@68 1353 if(number<0 && e!=null){
jpayne@68 1354 assert(false) : e;
jpayne@68 1355 throw new RuntimeException(e);
jpayne@68 1356 }
jpayne@68 1357
jpayne@68 1358 //TaxServer code could go here...
jpayne@68 1359
jpayne@68 1360 if(number>=0){return getNode(number);}
jpayne@68 1361 }
jpayne@68 1362 if(verbose){System.err.println("Can't process name "+s);}
jpayne@68 1363 if(Tools.isDigit(s.charAt(0)) && s.length()<=9){
jpayne@68 1364 try {
jpayne@68 1365 return getNode(Integer.parseInt(s));
jpayne@68 1366 } catch (NumberFormatException e) {
jpayne@68 1367 //ignore
jpayne@68 1368 }
jpayne@68 1369 }
jpayne@68 1370 return null;
jpayne@68 1371 }
jpayne@68 1372
jpayne@68 1373 /** Parse a delimited number from a header, or return -1 if formatted incorrectly. */
jpayne@68 1374 static long parseDelimitedNumber(String s, char delimiter){
jpayne@68 1375 if(s==null){return -1;}
jpayne@68 1376 int i=0;
jpayne@68 1377 while(i<s.length() && s.charAt(i)!=delimiter){i++;}
jpayne@68 1378 i++;
jpayne@68 1379 if(i>=s.length() || !Tools.isDigit(s.charAt(i))){return -1;}
jpayne@68 1380
jpayne@68 1381 long number=0;
jpayne@68 1382 while(i<s.length()){
jpayne@68 1383 char c=s.charAt(i);
jpayne@68 1384 if(c==delimiter || c==' ' || c=='\t'){break;}
jpayne@68 1385 assert(Tools.isDigit(c)) : c+"\n"+s;
jpayne@68 1386 number=(number*10)+(c-'0');
jpayne@68 1387 i++;
jpayne@68 1388 }
jpayne@68 1389 return number;
jpayne@68 1390 }
jpayne@68 1391
jpayne@68 1392 /** Parses sequence headers using NCBI's current header system, with Accessions. */
jpayne@68 1393 private TaxNode getNodeNewStyle(final String s){
jpayne@68 1394
jpayne@68 1395 int space=s.indexOf(' ');
jpayne@68 1396 int number=-1;
jpayne@68 1397
jpayne@68 1398 if(AccessionToTaxid.LOADED()){
jpayne@68 1399 if(space>0){
jpayne@68 1400 number=AccessionToTaxid.get(s.substring(0, space));
jpayne@68 1401 }else{
jpayne@68 1402 number=AccessionToTaxid.get(s);
jpayne@68 1403 }
jpayne@68 1404 }
jpayne@68 1405
jpayne@68 1406 if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){
jpayne@68 1407 try {
jpayne@68 1408 return getNode(Integer.parseInt(s));
jpayne@68 1409 } catch (NumberFormatException e) {
jpayne@68 1410 //ignore
jpayne@68 1411 }
jpayne@68 1412 }
jpayne@68 1413
jpayne@68 1414 if(number<0 && space>0){
jpayne@68 1415 number=parseNameToTaxid(s.substring(space+1));
jpayne@68 1416 }
jpayne@68 1417
jpayne@68 1418 if(number>-1){return getNode(number);}
jpayne@68 1419 if(space<0 && s.indexOf('_')>0){
jpayne@68 1420 return getNodeNewStyle(s.replace('_', ' '));
jpayne@68 1421 }
jpayne@68 1422 return null;
jpayne@68 1423 }
jpayne@68 1424
jpayne@68 1425 /**
jpayne@68 1426 * For parsing old-style NCBI headers.
jpayne@68 1427 */
jpayne@68 1428 public int parseAccessionToTaxid(String[] split){
jpayne@68 1429 if(split.length<4){
jpayne@68 1430 return -1;
jpayne@68 1431 }
jpayne@68 1432 int ncbi=AccessionToTaxid.get(split[3]);
jpayne@68 1433 return ncbi;
jpayne@68 1434 }
jpayne@68 1435
jpayne@68 1436 /**
jpayne@68 1437 * For parsing old-style NCBI headers.
jpayne@68 1438 */
jpayne@68 1439 public int parseHeaderNameToTaxid(String[] split){
jpayne@68 1440 if(split.length<5){
jpayne@68 1441 return -1;
jpayne@68 1442 }
jpayne@68 1443 return parseNameToTaxid(split[4]);
jpayne@68 1444 }
jpayne@68 1445
jpayne@68 1446 /**
jpayne@68 1447 * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens").
jpayne@68 1448 * If multiple nodes share the same name, returns the first; to get the full list,
jpayne@68 1449 * use getNodesByNameExtended.
jpayne@68 1450 * @param name Organism name.
jpayne@68 1451 * @return Organism TaxID, or -1 if not found.
jpayne@68 1452 */
jpayne@68 1453 public int parseNameToTaxid(String name){
jpayne@68 1454 // assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size());
jpayne@68 1455 List<TaxNode> list=null;
jpayne@68 1456
jpayne@68 1457 list=getNodesByNameExtended(name);
jpayne@68 1458
jpayne@68 1459 if(list==null || list.size()>1){return -1;}
jpayne@68 1460 return list.get(0).id;
jpayne@68 1461 }
jpayne@68 1462
jpayne@68 1463 /**
jpayne@68 1464 * Fetch nodes indicated by this name.
jpayne@68 1465 * @param name A taxonomic name delimited by space or underscore.
jpayne@68 1466 * @return Nodes corresponding to the name.
jpayne@68 1467 */
jpayne@68 1468 public List<TaxNode> getNodesByNameExtended(String name){
jpayne@68 1469 List<TaxNode> list=null;
jpayne@68 1470
jpayne@68 1471 list=getNodesByName(name);
jpayne@68 1472 if(list!=null){return list;}
jpayne@68 1473
jpayne@68 1474 name=name.replaceAll("_", " ").trim();
jpayne@68 1475 list=getNodesByName(name);
jpayne@68 1476 if(list!=null){return list;}
jpayne@68 1477
jpayne@68 1478 String[] split2=name.split(" ");
jpayne@68 1479
jpayne@68 1480 if(split2.length>7){
jpayne@68 1481 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7];
jpayne@68 1482 list=getNodesByName(term);
jpayne@68 1483 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1484 if(list!=null){return list;}
jpayne@68 1485 }
jpayne@68 1486
jpayne@68 1487 if(split2.length>6){
jpayne@68 1488 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6];
jpayne@68 1489 list=getNodesByName(term);
jpayne@68 1490 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1491 if(list!=null){return list;}
jpayne@68 1492 }
jpayne@68 1493
jpayne@68 1494 if(split2.length>5){
jpayne@68 1495 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5];
jpayne@68 1496 list=getNodesByName(term);
jpayne@68 1497 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1498 if(list!=null){return list;}
jpayne@68 1499 }
jpayne@68 1500 if(split2.length>4){
jpayne@68 1501 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4];
jpayne@68 1502 list=getNodesByName(term);
jpayne@68 1503 // System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1504 if(list!=null){return list;}
jpayne@68 1505 }
jpayne@68 1506 if(split2.length>3){
jpayne@68 1507 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3];
jpayne@68 1508 list=getNodesByName(term);
jpayne@68 1509 // System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1510 if(list!=null){return list;}
jpayne@68 1511 }
jpayne@68 1512 if(split2.length>2){
jpayne@68 1513 String term=split2[0]+" "+split2[1]+" "+split2[2];
jpayne@68 1514 list=getNodesByName(term);
jpayne@68 1515 // System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1516 if(list!=null){return list;}
jpayne@68 1517 }
jpayne@68 1518 if(split2.length>1){
jpayne@68 1519 String term=split2[0]+" "+split2[1];
jpayne@68 1520 list=getNodesByName(term);
jpayne@68 1521 // System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1522 if(list!=null){return list;}
jpayne@68 1523 }
jpayne@68 1524 if(split2.length>0){
jpayne@68 1525 String term=split2[0];
jpayne@68 1526 list=getNodesByName(term);
jpayne@68 1527 // System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
jpayne@68 1528 if(list!=null){return list;}
jpayne@68 1529 }
jpayne@68 1530
jpayne@68 1531 return null;
jpayne@68 1532 }
jpayne@68 1533
jpayne@68 1534 /*--------------------------------------------------------------*/
jpayne@68 1535 /*---------------- Assorted Methods ----------------*/
jpayne@68 1536 /*--------------------------------------------------------------*/
jpayne@68 1537
jpayne@68 1538 /**
jpayne@68 1539 * Return the TaxID of the lowest ancestor node at least the specified level,
jpayne@68 1540 * including this node itself. Level is the normal (non-extended) level.
jpayne@68 1541 * @param taxID
jpayne@68 1542 * @param taxLevel
jpayne@68 1543 * @return
jpayne@68 1544 */
jpayne@68 1545 public int promote(final int taxID, int taxLevel){
jpayne@68 1546 TaxNode tn=null;
jpayne@68 1547 tn=(taxID<1 ? null : getNode(taxID));
jpayne@68 1548 tn=promote(tn, taxLevel);
jpayne@68 1549 return (tn==null ? taxID : tn.id);
jpayne@68 1550 }
jpayne@68 1551
jpayne@68 1552 /**
jpayne@68 1553 * Fetch the first node in this node's lineage of at least the indicated level.
jpayne@68 1554 * This can be the node itself or an ancestor.
jpayne@68 1555 * @see getNodeAtLevelExtended
jpayne@68 1556 * @param tn Node in question
jpayne@68 1557 * @param taxLevel Desired minimum level
jpayne@68 1558 * @return A node at the desired level
jpayne@68 1559 */
jpayne@68 1560 public TaxNode promote(TaxNode tn, int taxLevel){
jpayne@68 1561 while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){
jpayne@68 1562 TaxNode temp=getNode(tn.pid);
jpayne@68 1563 if(temp==null || temp==tn || temp.level>=TaxTree.LIFE || temp.level>taxLevel){break;}
jpayne@68 1564 tn=temp;
jpayne@68 1565 }
jpayne@68 1566 return tn;
jpayne@68 1567 }
jpayne@68 1568
jpayne@68 1569 /**
jpayne@68 1570 * Determine the TaxID of the node's parent.
jpayne@68 1571 * @param id TaxID of child node
jpayne@68 1572 * @return Parent TaxID
jpayne@68 1573 */
jpayne@68 1574 public int getParentID(int id){
jpayne@68 1575 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
jpayne@68 1576 + "\nPlease redownload it and regenerate the taxtree.";
jpayne@68 1577 if(id<0 || id>=nodes.length){return -1;}
jpayne@68 1578 TaxNode tn=nodes[id];
jpayne@68 1579 if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);}
jpayne@68 1580 return tn==null ? -1 : tn.pid;
jpayne@68 1581 }
jpayne@68 1582
jpayne@68 1583 /**
jpayne@68 1584 * Fetch the node with this TaxID.
jpayne@68 1585 * @param id TaxID
jpayne@68 1586 * @return Node
jpayne@68 1587 */
jpayne@68 1588 public TaxNode getNode(int id){
jpayne@68 1589 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
jpayne@68 1590 + "\nPlease redownload it and regenerate the taxtree.";
jpayne@68 1591 if(id<0 || id>=nodes.length){return null;}
jpayne@68 1592 TaxNode tn=nodes[id];
jpayne@68 1593 if(tn!=null || mergedMap==null){return tn;}
jpayne@68 1594 return getNode(mergedMap.get(id), true);
jpayne@68 1595 }
jpayne@68 1596
jpayne@68 1597 /**
jpayne@68 1598 * Fetch the node with this TaxID, but don't throw assertions upon failure.
jpayne@68 1599 * @param id TaxID
jpayne@68 1600 * @return Node
jpayne@68 1601 */
jpayne@68 1602 public TaxNode getNode(int id, boolean skipAssertion){
jpayne@68 1603 assert(skipAssertion || id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
jpayne@68 1604 + "\nPlease redownload it and regenerate the taxtree.";
jpayne@68 1605 if(id<0 || id>=nodes.length){return null;}
jpayne@68 1606 TaxNode tn=nodes[id];
jpayne@68 1607 if(tn!=null || mergedMap==null){return tn;}
jpayne@68 1608 return getNode(mergedMap.get(id), true);
jpayne@68 1609 }
jpayne@68 1610
jpayne@68 1611 public TaxNode getNodeAtLevel(int id, int minLevel){
jpayne@68 1612 return getNodeAtLevel(id, minLevel, DOMAIN);
jpayne@68 1613 }
jpayne@68 1614
jpayne@68 1615 public TaxNode getNodeAtLevelExtended(int id, int minLevelE){
jpayne@68 1616 return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E);
jpayne@68 1617 }
jpayne@68 1618
jpayne@68 1619 public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){
jpayne@68 1620 final int minLevelExtended=levelToExtended(minLevel);
jpayne@68 1621 final int maxLevelExtended=levelToExtended(maxLevel);
jpayne@68 1622 return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended);
jpayne@68 1623 }
jpayne@68 1624
jpayne@68 1625 public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){
jpayne@68 1626 TaxNode tn=getNode(id);
jpayne@68 1627 while(tn!=null && tn.pid!=tn.id && tn.levelExtended<minLevelE){
jpayne@68 1628 TaxNode temp=getNode(tn.pid);
jpayne@68 1629 if(temp==null || temp.levelExtended>maxLevelE){break;}
jpayne@68 1630 tn=temp;
jpayne@68 1631 }
jpayne@68 1632 return tn;
jpayne@68 1633 }
jpayne@68 1634
jpayne@68 1635 public int getIdAtLevelExtended(int taxID, int taxLevelExtended){
jpayne@68 1636 if(taxLevelExtended<0){return taxID;}
jpayne@68 1637 TaxNode tn=getNode(taxID);
jpayne@68 1638 while(tn!=null && tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){
jpayne@68 1639 tn=getNode(tn.pid);
jpayne@68 1640 if(tn.levelExtended>taxLevelExtended){break;}
jpayne@68 1641 taxID=tn.id;
jpayne@68 1642 }
jpayne@68 1643 return taxID;
jpayne@68 1644 }
jpayne@68 1645
jpayne@68 1646 /**
jpayne@68 1647 * Fetch the node with this name.
jpayne@68 1648 * Throw an assertion if there are multiple such nodes.
jpayne@68 1649 * @param s Organism name.
jpayne@68 1650 * @return Node with given name.
jpayne@68 1651 */
jpayne@68 1652 public TaxNode getNodeByName(String s){
jpayne@68 1653 List<TaxNode> list=getNodesByName(s, false);
jpayne@68 1654 if(list==null){list=getNodesByName(s, true);}
jpayne@68 1655 if(list==null || list.isEmpty()){return null;}
jpayne@68 1656 if(list.size()==1){return list.get(0);}
jpayne@68 1657 assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n";
jpayne@68 1658 TaxNode a=list.get(0);
jpayne@68 1659 for(int i=1; i<list.size(); i++){
jpayne@68 1660 TaxNode b=list.get(i);
jpayne@68 1661 //Keep the most specific node
jpayne@68 1662 // if(a==null || (b!=null && b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf())){//not necessary
jpayne@68 1663 if(b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf()){
jpayne@68 1664 a=b;
jpayne@68 1665 }
jpayne@68 1666 }
jpayne@68 1667 return a;
jpayne@68 1668 }
jpayne@68 1669
jpayne@68 1670 /**
jpayne@68 1671 * Fetch a list of all nodes with this name.
jpayne@68 1672 * @param s Organism name.
jpayne@68 1673 * @return Nodes with given name.
jpayne@68 1674 */
jpayne@68 1675 public List<TaxNode> getNodesByName(String s){
jpayne@68 1676 List<TaxNode> list=getNodesByName(s, false);
jpayne@68 1677 if(list==null){list=getNodesByName(s, true);}
jpayne@68 1678 return list;
jpayne@68 1679 }
jpayne@68 1680
jpayne@68 1681 /**
jpayne@68 1682 * Fetch a map of names to nodes. If absent, create it first.
jpayne@68 1683 * @param lowercase If true, return the map with lowercase keys.
jpayne@68 1684 * @return Map of names to nodes.
jpayne@68 1685 */
jpayne@68 1686 private HashMap<String, ArrayList<TaxNode>> getMap(boolean lowercase){
jpayne@68 1687 HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap);
jpayne@68 1688 if(map==null){
jpayne@68 1689 synchronized(this){hashNames(true);}
jpayne@68 1690 map=(lowercase ? nameMapLower : nameMap);
jpayne@68 1691 }
jpayne@68 1692 assert(map!=null) : "Tax names were not hashed.";
jpayne@68 1693 return map;
jpayne@68 1694 }
jpayne@68 1695
jpayne@68 1696 private List<TaxNode> getNodesByName(String s, boolean lowercase){
jpayne@68 1697 if(s==null){return null;}
jpayne@68 1698 if(s.indexOf('_')>=0){s=s.replace('_', ' ');}
jpayne@68 1699 if(lowercase){s=s.toLowerCase();}
jpayne@68 1700 // System.err.println("Searching for "+s);
jpayne@68 1701 final HashMap<String, ArrayList<TaxNode>> map=getMap(lowercase);
jpayne@68 1702 ArrayList<TaxNode> list=map.get(s);
jpayne@68 1703 if(list!=null){return list;}
jpayne@68 1704 // System.err.println("No matches for '"+s+"'");
jpayne@68 1705
jpayne@68 1706 // assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s);
jpayne@68 1707
jpayne@68 1708 if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;}
jpayne@68 1709 String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8);
jpayne@68 1710 // System.err.println("Array: "+Arrays.toString(split));
jpayne@68 1711 list=map.get(split[split.length-1]);
jpayne@68 1712 if(list==null){return list;}
jpayne@68 1713 // System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")");
jpayne@68 1714
jpayne@68 1715 int matchCount=0;
jpayne@68 1716 for(TaxNode tn : list){
jpayne@68 1717 if(tn.matchesName(split, split.length-1, this)){matchCount++;}
jpayne@68 1718 }
jpayne@68 1719 if(matchCount==list.size()){return list;}
jpayne@68 1720 if(matchCount<1){return null;}
jpayne@68 1721 ArrayList<TaxNode> hits=new ArrayList<TaxNode>(matchCount);
jpayne@68 1722 for(TaxNode tn : list){
jpayne@68 1723 if(tn.matchesName(split, split.length-1, this)){hits.add(tn);}
jpayne@68 1724 }
jpayne@68 1725 return hits;
jpayne@68 1726 }
jpayne@68 1727 public ArrayList<TaxNode> getAncestors(int id){
jpayne@68 1728 TaxNode current=getNode(id);
jpayne@68 1729 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
jpayne@68 1730 while(current!=null && current.pid!=current.id){//ignores root
jpayne@68 1731 list.add(current);
jpayne@68 1732 current=getNode(current.pid);
jpayne@68 1733 }
jpayne@68 1734 //optionally add root here
jpayne@68 1735 return list;
jpayne@68 1736 }
jpayne@68 1737
jpayne@68 1738 public void increment(IntList ids, IntList counts, boolean sync){
jpayne@68 1739
jpayne@68 1740 ids.sort();
jpayne@68 1741 ids.getUniqueCounts(counts);
jpayne@68 1742
jpayne@68 1743 if(!sync){
jpayne@68 1744 for(int i=0; i<ids.size; i++){
jpayne@68 1745 int id=ids.get(i);
jpayne@68 1746 int count=counts.get(i);
jpayne@68 1747 incrementRaw(id, count);
jpayne@68 1748 }
jpayne@68 1749 }else{
jpayne@68 1750 synchronized(this){
jpayne@68 1751 for(int i=0; i<ids.size; i++){
jpayne@68 1752 int id=ids.get(i);
jpayne@68 1753 int count=counts.get(i);
jpayne@68 1754 incrementRaw(id, count);
jpayne@68 1755 }
jpayne@68 1756 }
jpayne@68 1757 }
jpayne@68 1758 }
jpayne@68 1759
jpayne@68 1760 public void incrementRaw(int id, long amt){
jpayne@68 1761 assert(id>=0 && id<nodes.length) : "TaxID "+id+" is out of range."+(id<0 ? "" : " Possibly the taxonomy data needs to be updated.");
jpayne@68 1762 assert(nodes[id]!=null) : "No node for TaxID "+id+"; possibly the taxonomy data needs to be updated.";
jpayne@68 1763 nodes[id].incrementRaw(amt);
jpayne@68 1764 }
jpayne@68 1765
jpayne@68 1766 public void percolateUp(){
jpayne@68 1767 for(int i=0; i<treeLevelsExtended.length; i++){
jpayne@68 1768 percolateUp(i);
jpayne@68 1769 }
jpayne@68 1770 }
jpayne@68 1771
jpayne@68 1772 public void percolateUp(final int fromLevel){
jpayne@68 1773 final TaxNode[] stratum=treeLevelsExtended[fromLevel];
jpayne@68 1774 for(final TaxNode n : stratum){
jpayne@68 1775 n.incrementSum(n.countRaw);
jpayne@68 1776 TaxNode parent=nodes[n.pid];
jpayne@68 1777 if(n!=parent){
jpayne@68 1778 parent.incrementSum(n.countSum);
jpayne@68 1779 }
jpayne@68 1780 }
jpayne@68 1781 }
jpayne@68 1782
jpayne@68 1783 /** Add this amount to the node and all its ancestors. */
jpayne@68 1784 public void percolateUp(TaxNode node, long amt){
jpayne@68 1785 if(amt==0){return;}
jpayne@68 1786 if(verbose){System.err.println("percolateUp("+amt+") node: "+node);}
jpayne@68 1787 while(node.id!=node.pid){
jpayne@68 1788 node.incrementSum(amt);
jpayne@68 1789 node=nodes[node.pid];
jpayne@68 1790 }
jpayne@68 1791 node.incrementSum(amt);
jpayne@68 1792 }
jpayne@68 1793
jpayne@68 1794 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){
jpayne@68 1795 return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1);
jpayne@68 1796 }
jpayne@68 1797
jpayne@68 1798 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){
jpayne@68 1799 final int minLevelExtended=levelToExtended(minLevel);
jpayne@68 1800 final int maxLevelExtended=levelToExtended(maxLevel);
jpayne@68 1801 // assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended;
jpayne@68 1802 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
jpayne@68 1803 for(int i=minLevelExtended; i<nodesPerLevelExtended.length && i<=maxLevelExtended; i++){
jpayne@68 1804 list.addAll(gatherNodesAtLeastLimitExtended(i, limit));
jpayne@68 1805 }
jpayne@68 1806 Shared.sort(list, TaxNode.countComparator);
jpayne@68 1807 return list;
jpayne@68 1808 }
jpayne@68 1809
jpayne@68 1810 public ArrayList<TaxNode> gatherNodesAtLeastLimitExtended(final int fromLevelExtended, final long limit){
jpayne@68 1811 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
jpayne@68 1812 final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended];
jpayne@68 1813 for(final TaxNode n : stratum){
jpayne@68 1814 if(n.countSum>=limit){
jpayne@68 1815 list.add(n);
jpayne@68 1816 TaxNode parent=nodes[n.pid];
jpayne@68 1817 if(n!=parent){
jpayne@68 1818 percolateUp(parent, -n.countSum);//123 This was negative for some reason
jpayne@68 1819 }
jpayne@68 1820 }
jpayne@68 1821 }
jpayne@68 1822 Shared.sort(list, TaxNode.countComparator);
jpayne@68 1823 return list;
jpayne@68 1824 }
jpayne@68 1825
jpayne@68 1826 /*--------------------------------------------------------------*/
jpayne@68 1827 /*---------------- Static Initializers ----------------*/
jpayne@68 1828 /*--------------------------------------------------------------*/
jpayne@68 1829
jpayne@68 1830 /**
jpayne@68 1831 * Generate the name to level number map.
jpayne@68 1832 */
jpayne@68 1833 private static HashMap<String, Integer> makeLevelMap() {
jpayne@68 1834 HashMap<String, Integer> map=new HashMap<String, Integer>(31);
jpayne@68 1835 for(int i=0; i<taxLevelNames.length; i++){
jpayne@68 1836 map.put(taxLevelNames[i], i);
jpayne@68 1837 map.put(taxLevelNames[i].toUpperCase(), i);
jpayne@68 1838 }
jpayne@68 1839 map.put("clade", NO_RANK);
jpayne@68 1840 map.put("clade".toUpperCase(), NO_RANK);
jpayne@68 1841 return map;
jpayne@68 1842 }
jpayne@68 1843
jpayne@68 1844 /**
jpayne@68 1845 * Generate the name to extended level number map.
jpayne@68 1846 */
jpayne@68 1847 private static HashMap<String, Integer> makeLevelMapExtended() {
jpayne@68 1848 HashMap<String, Integer> map=new HashMap<String, Integer>(129);
jpayne@68 1849 for(int i=0; i<taxLevelNamesExtended.length; i++){
jpayne@68 1850 map.put(taxLevelNamesExtended[i], i);
jpayne@68 1851 map.put(taxLevelNamesExtended[i].toUpperCase(), i);
jpayne@68 1852 }
jpayne@68 1853 map.put("clade", NO_RANK_E);
jpayne@68 1854 map.put("clade".toUpperCase(), NO_RANK_E);
jpayne@68 1855 return map;
jpayne@68 1856 }
jpayne@68 1857
jpayne@68 1858 /**
jpayne@68 1859 * I think this maps normal and extend names to normal level numbers.
jpayne@68 1860 */
jpayne@68 1861 private static HashMap<String, Integer> makeAltLevelMap() {
jpayne@68 1862 HashMap<String, Integer> map=new HashMap<String, Integer>(129);
jpayne@68 1863 for(int i=0; i<taxLevelNames.length; i++){
jpayne@68 1864 map.put(taxLevelNames[i], i);
jpayne@68 1865 map.put(taxLevelNames[i].toUpperCase(), i);
jpayne@68 1866 }
jpayne@68 1867 map.put("clade", NO_RANK);
jpayne@68 1868 map.put("clade".toUpperCase(), NO_RANK);
jpayne@68 1869
jpayne@68 1870 //Add synonyms
jpayne@68 1871 // map.put("subfamily", map.get("family"));
jpayne@68 1872 // map.put("tribe", map.get("family"));
jpayne@68 1873 // map.put("varietas", map.get("subspecies"));
jpayne@68 1874 // map.put("subgenus", map.get("genus"));
jpayne@68 1875 // map.put("forma", map.get("subspecies"));
jpayne@68 1876 // map.put("species group", map.get("genus"));
jpayne@68 1877 // map.put("species subgroup", map.get("genus"));
jpayne@68 1878 // map.put("cohort", map.get("class"));
jpayne@68 1879 // map.put("subclass", map.get("class"));
jpayne@68 1880 // map.put("infraorder", map.get("order"));
jpayne@68 1881 // map.put("superorder", map.get("class"));
jpayne@68 1882 // map.put("subphylum", map.get("phylum"));
jpayne@68 1883 // map.put("infraclass", map.get("class"));
jpayne@68 1884 // map.put("superkingdom", map.get("division"));
jpayne@68 1885 // map.put("parvorder", map.get("order"));
jpayne@68 1886 // map.put("superclass", map.get("phylum"));
jpayne@68 1887 // map.put("superphylum", map.get("kingdom"));
jpayne@68 1888 // map.put("subkingdom", map.get("kingdom"));
jpayne@68 1889 // map.put("superfamily", map.get("order"));
jpayne@68 1890 // map.put("superkingdom", map.get("domain"));
jpayne@68 1891 // map.put("suborder", map.get("order"));
jpayne@68 1892 // map.put("subtribe", map.get("family"));
jpayne@68 1893
jpayne@68 1894 for(String[] array : taxLevelNamesExtendedMatrix){
jpayne@68 1895 String head=array[array.length-1];
jpayne@68 1896 Integer value=map.get(head);
jpayne@68 1897 assert(value!=null) : head;
jpayne@68 1898 for(String key : array){
jpayne@68 1899 if(key!=head){
jpayne@68 1900 assert(!map.containsKey(key)) : "Map already contains key "+key+": "+Arrays.toString(array);
jpayne@68 1901 map.put(key, value);
jpayne@68 1902 map.put(key.toUpperCase(), value);
jpayne@68 1903 }
jpayne@68 1904 }
jpayne@68 1905 }
jpayne@68 1906
jpayne@68 1907 return map;
jpayne@68 1908 }
jpayne@68 1909
jpayne@68 1910 /*--------------------------------------------------------------*/
jpayne@68 1911 /*---------------- Size ----------------*/
jpayne@68 1912 /*--------------------------------------------------------------*/
jpayne@68 1913
jpayne@68 1914 /** Number of bp associated with this node in RefSeq */
jpayne@68 1915 public long toSize(TaxNode tn){
jpayne@68 1916 if(tn==null){return 0;}
jpayne@68 1917 if(refseqSizeMap==null){return -1L;}
jpayne@68 1918 final long x=refseqSizeMap.get(tn.id);
jpayne@68 1919 return Tools.max(0, x);
jpayne@68 1920 }
jpayne@68 1921
jpayne@68 1922 /** Number of bp associated with this node and descendants in RefSeq */
jpayne@68 1923 public long toSizeC(TaxNode tn){
jpayne@68 1924 if(tn==null){return 0;}
jpayne@68 1925 if(refseqSizeMapC==null){return -1L;}
jpayne@68 1926 final long x=refseqSizeMapC.get(tn.id);
jpayne@68 1927 return Tools.max(0, x);
jpayne@68 1928 }
jpayne@68 1929
jpayne@68 1930 /** Number of sequences associated with this node in RefSeq */
jpayne@68 1931 public int toSeqs(TaxNode tn){
jpayne@68 1932 if(tn==null){return 0;}
jpayne@68 1933 if(refseqSeqMap==null){return -1;}
jpayne@68 1934 final int x=refseqSeqMap.get(tn.id);
jpayne@68 1935 return Tools.max(0, x);
jpayne@68 1936 }
jpayne@68 1937
jpayne@68 1938 /** Number of sequences associated with this node and descandants in RefSeq */
jpayne@68 1939 public long toSeqsC(TaxNode tn){
jpayne@68 1940 if(tn==null){return 0;}
jpayne@68 1941 if(refseqSeqMapC==null){return -1L;}
jpayne@68 1942 final long x=refseqSeqMapC.get(tn.id);
jpayne@68 1943 return Tools.max(0, x);
jpayne@68 1944 }
jpayne@68 1945
jpayne@68 1946 /** Number of descendants of this node */
jpayne@68 1947 public int toNodes(TaxNode tn){
jpayne@68 1948 if(tn==null){return 0;}
jpayne@68 1949 if(nodeMapC==null){return -1;}
jpayne@68 1950 final int x=nodeMapC.get(tn.id);
jpayne@68 1951 return Tools.max(0, x);
jpayne@68 1952 }
jpayne@68 1953
jpayne@68 1954 /**
jpayne@68 1955 * Fills refseqSizeMap, refseqSizeMapC, etc. from a file containing the summary.
jpayne@68 1956 * @param fname Size file name
jpayne@68 1957 */
jpayne@68 1958 public void loadSizeFile(String fname){
jpayne@68 1959 if(fname==null){return;}
jpayne@68 1960 assert(refseqSizeMap==null);
jpayne@68 1961 refseqSizeMap=new IntLongHashMap();
jpayne@68 1962 refseqSizeMapC=new IntLongHashMap();
jpayne@68 1963 refseqSeqMap=new IntHashMap();
jpayne@68 1964 refseqSeqMapC=new IntLongHashMap();
jpayne@68 1965 nodeMapC=new IntHashMap();
jpayne@68 1966
jpayne@68 1967 final ByteFile bf=ByteFile.makeByteFile(fname, true);
jpayne@68 1968 final byte delimiter='\t';
jpayne@68 1969 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
jpayne@68 1970 if(line.length>0 && line[0]!='#'){
jpayne@68 1971 int a=0, b=0;
jpayne@68 1972
jpayne@68 1973 while(b<line.length && line[b]!=delimiter){b++;}
jpayne@68 1974 assert(b>a) : "Missing field 0: "+new String(line);
jpayne@68 1975 int tid=Parse.parseInt(line, a, b);
jpayne@68 1976 b++;
jpayne@68 1977 a=b;
jpayne@68 1978
jpayne@68 1979 while(b<line.length && line[b]!=delimiter){b++;}
jpayne@68 1980 assert(b>a) : "Missing field 1: "+new String(line);
jpayne@68 1981 long size=Parse.parseLong(line, a, b);
jpayne@68 1982 b++;
jpayne@68 1983 a=b;
jpayne@68 1984
jpayne@68 1985 while(b<line.length && line[b]!=delimiter){b++;}
jpayne@68 1986 assert(b>a) : "Missing field 2: "+new String(line);
jpayne@68 1987 long csize=Parse.parseLong(line, a, b);
jpayne@68 1988 b++;
jpayne@68 1989 a=b;
jpayne@68 1990
jpayne@68 1991 while(b<line.length && line[b]!=delimiter){b++;}
jpayne@68 1992 assert(b>a) : "Missing field 3: "+new String(line);
jpayne@68 1993 int seqs=Parse.parseInt(line, a, b);
jpayne@68 1994 b++;
jpayne@68 1995 a=b;
jpayne@68 1996
jpayne@68 1997 while(b<line.length && line[b]!=delimiter){b++;}
jpayne@68 1998 assert(b>a) : "Missing field 4: "+new String(line);
jpayne@68 1999 long cseqs=Parse.parseLong(line, a, b);
jpayne@68 2000 b++;
jpayne@68 2001 a=b;
jpayne@68 2002
jpayne@68 2003 while(b<line.length && line[b]!=delimiter){b++;}
jpayne@68 2004 assert(b>a) : "Missing field 5: "+new String(line);
jpayne@68 2005 int cnodes=Parse.parseInt(line, a, b);
jpayne@68 2006 b++;
jpayne@68 2007 a=b;
jpayne@68 2008
jpayne@68 2009 if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);}
jpayne@68 2010 if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);}
jpayne@68 2011 if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);}
jpayne@68 2012 if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);}
jpayne@68 2013 if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);}
jpayne@68 2014 }
jpayne@68 2015 }
jpayne@68 2016 bf.close();
jpayne@68 2017 }
jpayne@68 2018
jpayne@68 2019 /*--------------------------------------------------------------*/
jpayne@68 2020 /*---------------- IMG ----------------*/
jpayne@68 2021 /*--------------------------------------------------------------*/
jpayne@68 2022
jpayne@68 2023 public static int imgToTaxid(long img){
jpayne@68 2024 ImgRecord ir=imgMap.get(img);
jpayne@68 2025 // assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n";
jpayne@68 2026 return ir==null ? -1 : ir.taxID;
jpayne@68 2027 }
jpayne@68 2028
jpayne@68 2029 public TaxNode imgToTaxNode(long img){
jpayne@68 2030 int tid=imgToTaxid(img);
jpayne@68 2031 return tid<1 ? null : getNode(tid);
jpayne@68 2032 }
jpayne@68 2033
jpayne@68 2034 // public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){
jpayne@68 2035 // assert(imgMap==null);
jpayne@68 2036 // if(fname==null){return 0;}
jpayne@68 2037 // ImgRecord2.storeName=storeName;
jpayne@68 2038 // if(outstream!=null){System.err.println("Loading IMG.");}
jpayne@68 2039 // Timer t=new Timer(outstream, false);
jpayne@68 2040 // ImgRecord2[] array=ImgRecord2.toArray(fname);
jpayne@68 2041 // int x=loadIMG(array);
jpayne@68 2042 // t.stopAndPrint();
jpayne@68 2043 // return x;
jpayne@68 2044 // }
jpayne@68 2045
jpayne@68 2046 public static int loadIMG(String fname, boolean storeName, PrintStream outstream){
jpayne@68 2047 assert(imgMap==null);
jpayne@68 2048 if(fname==null){return 0;}
jpayne@68 2049 ImgRecord.storeName=storeName;
jpayne@68 2050 if(outstream!=null){System.err.println("Loading IMG.");}
jpayne@68 2051 Timer t=new Timer(outstream, false);
jpayne@68 2052 ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ);
jpayne@68 2053 int x=loadIMG(array);
jpayne@68 2054 t.stopAndPrint();
jpayne@68 2055 return x;
jpayne@68 2056 }
jpayne@68 2057
jpayne@68 2058 public static int loadIMG(ImgRecord[] array){
jpayne@68 2059 assert(imgMap==null);
jpayne@68 2060 imgMap=new HashMap<Long, ImgRecord>((int)(array.length*1.5));
jpayne@68 2061 for(ImgRecord record : array){
jpayne@68 2062 imgMap.put(record.imgID, record);
jpayne@68 2063 }
jpayne@68 2064 return imgMap.size();
jpayne@68 2065 }
jpayne@68 2066
jpayne@68 2067 @Deprecated
jpayne@68 2068 public static int parseLevel(String b){
jpayne@68 2069 final int level;
jpayne@68 2070 if(b==null){level=-1;}
jpayne@68 2071 else if(Tools.isNumeric(b.charAt(0))){
jpayne@68 2072 level=Integer.parseInt(b);
jpayne@68 2073 }else{
jpayne@68 2074 level=stringToLevel(b.toLowerCase());
jpayne@68 2075 }
jpayne@68 2076 return level;
jpayne@68 2077 }
jpayne@68 2078
jpayne@68 2079 public static int parseLevelExtended(String b){
jpayne@68 2080 final int level;
jpayne@68 2081 if(b==null){level=-1;}
jpayne@68 2082 else if(Tools.isNumeric(b.charAt(0))){
jpayne@68 2083 level=levelToExtended(Integer.parseInt(b));
jpayne@68 2084 }else{
jpayne@68 2085 level=stringToLevelExtended(b.toLowerCase());
jpayne@68 2086 }
jpayne@68 2087 return level;
jpayne@68 2088 }
jpayne@68 2089
jpayne@68 2090 public boolean isUnclassified(int tid){
jpayne@68 2091 TaxNode tn=getNode(tid);
jpayne@68 2092 while(tn!=null && tn.id!=tn.pid){
jpayne@68 2093 if(tn.isUnclassified()){return true;}
jpayne@68 2094 if(tn.pid==tn.id){break;}
jpayne@68 2095 tn=getNode(tn.pid);
jpayne@68 2096 }
jpayne@68 2097 return false;
jpayne@68 2098 }
jpayne@68 2099
jpayne@68 2100 public boolean isEnvironmentalSample(int tid){
jpayne@68 2101 TaxNode tn=getNode(tid);
jpayne@68 2102 while(tn!=null && tn.id!=tn.pid){
jpayne@68 2103 if(tn.isEnvironmentalSample()){return true;}
jpayne@68 2104 if(tn.pid==tn.id){break;}
jpayne@68 2105 tn=getNode(tn.pid);
jpayne@68 2106 }
jpayne@68 2107 return false;
jpayne@68 2108 }
jpayne@68 2109
jpayne@68 2110 public boolean isVirus(int tid){
jpayne@68 2111 TaxNode tn=getNode(tid);
jpayne@68 2112 while(tn!=null && tn.id!=tn.pid){
jpayne@68 2113 if(tn.id==VIRUSES_ID){return true;}
jpayne@68 2114 if(tn.pid==tn.id){break;}
jpayne@68 2115 tn=getNode(tn.pid);
jpayne@68 2116 }
jpayne@68 2117 return false;
jpayne@68 2118 }
jpayne@68 2119
jpayne@68 2120 public long definedLevels(int tid){
jpayne@68 2121 long levels=0;
jpayne@68 2122 TaxNode tn=getNode(tid);
jpayne@68 2123 while(tn!=null && tn.id!=tn.pid){
jpayne@68 2124 levels=levels|(1L<<tn.level);
jpayne@68 2125 }
jpayne@68 2126 return levels;
jpayne@68 2127 }
jpayne@68 2128
jpayne@68 2129 public long definedLevelsExtended(int tid){
jpayne@68 2130 long levels=0;
jpayne@68 2131 TaxNode tn=getNode(tid);
jpayne@68 2132 while(tn!=null && tn.id!=tn.pid){
jpayne@68 2133 levels=levels|(1L<<tn.levelExtended);
jpayne@68 2134 }
jpayne@68 2135 return levels;
jpayne@68 2136 }
jpayne@68 2137
jpayne@68 2138 /**
jpayne@68 2139 * Generates the mononomial name for this taxonomic level based on the scientific name.
jpayne@68 2140 * For example, "Homo sapiens" -> "Sapiens"
jpayne@68 2141 * @param tid TaxID
jpayne@68 2142 * @return Correct name for this node.
jpayne@68 2143 */
jpayne@68 2144 public String mononomial(int tid){return mononomial(getNode(tid));}
jpayne@68 2145 public String mononomial(TaxNode tn){
jpayne@68 2146 if(tn==null){return null;}
jpayne@68 2147 String name=tn.name;
jpayne@68 2148 if(name.indexOf(' ')<0){return name;}
jpayne@68 2149 TaxNode parent=getNode(tn.pid);
jpayne@68 2150 if(parent==null){return name;}
jpayne@68 2151 String pname=parent.name;
jpayne@68 2152 if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){
jpayne@68 2153 name=name.substring(pname.length()+1);
jpayne@68 2154 }
jpayne@68 2155 return name;
jpayne@68 2156 }
jpayne@68 2157
jpayne@68 2158 /*--------------------------------------------------------------*/
jpayne@68 2159 /*---------------- Fields ----------------*/
jpayne@68 2160 /*--------------------------------------------------------------*/
jpayne@68 2161
jpayne@68 2162 /** All nodes in the tree in a flat array, indexed by TaxiD */
jpayne@68 2163 public final TaxNode[] nodes;
jpayne@68 2164
jpayne@68 2165 /** Number of nodes per normal level */
jpayne@68 2166 public final int[] nodesPerLevel=new int[taxLevelNames.length];
jpayne@68 2167
jpayne@68 2168 /** Number of nodes per extended level */
jpayne@68 2169 public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length];
jpayne@68 2170
jpayne@68 2171 /** Number of nodes in the tree */
jpayne@68 2172 public final int nodeCount;
jpayne@68 2173
jpayne@68 2174 /** Maps old TaxIDs to new TaxIDs */
jpayne@68 2175 public final IntHashMap mergedMap;
jpayne@68 2176
jpayne@68 2177 /** Arrays of all nodes at a given taxonomic level (extended) */
jpayne@68 2178 public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][];
jpayne@68 2179
jpayne@68 2180 /** Map of names to nodes */
jpayne@68 2181 HashMap<String, ArrayList<TaxNode>> nameMap;
jpayne@68 2182 /** Map of lowercase names to nodes */
jpayne@68 2183 HashMap<String, ArrayList<TaxNode>> nameMapLower;
jpayne@68 2184 /** Map of nodes to child nodes */
jpayne@68 2185 HashMap<TaxNode, ArrayList<TaxNode>> childMap;
jpayne@68 2186 public HashMap<String, ArrayList<TaxNode>> nameMap(){return nameMap;}
jpayne@68 2187
jpayne@68 2188 @Deprecated
jpayne@68 2189 public int minValidTaxa=0; //TODO: Remove (will break serialization)
jpayne@68 2190
jpayne@68 2191 /** Infer ranks for no-rank nodes, when possible */
jpayne@68 2192 public boolean simplify=true;
jpayne@68 2193 /** See simplify() for details, works in conjunction with simplify */
jpayne@68 2194 public boolean reassign=true;
jpayne@68 2195 /** Discard no-rank nodes */
jpayne@68 2196 public boolean skipNorank=false;
jpayne@68 2197 public int inferRankLimit=0;//levelMap.get("species");
jpayne@68 2198
jpayne@68 2199 //Node Statistics
jpayne@68 2200 /** Number of bases assigned to this TaxID in RefSeq */
jpayne@68 2201 private IntLongHashMap refseqSizeMap;
jpayne@68 2202 /** Number of bases assigned to this TaxID and descendants in RefSeq */
jpayne@68 2203 private IntLongHashMap refseqSizeMapC;
jpayne@68 2204 /** Number of sequences assigned to this TaxID in RefSeq */
jpayne@68 2205 private IntHashMap refseqSeqMap;
jpayne@68 2206 /** Number of sequences assigned to this TaxID and descendants in RefSeq */
jpayne@68 2207 private IntLongHashMap refseqSeqMapC;
jpayne@68 2208 /** Number of descendant nodes, inclusive, for each TaxID */
jpayne@68 2209 private IntHashMap nodeMapC;
jpayne@68 2210
jpayne@68 2211 /*--------------------------------------------------------------*/
jpayne@68 2212 /*---------------- Statics ----------------*/
jpayne@68 2213 /*--------------------------------------------------------------*/
jpayne@68 2214
jpayne@68 2215 /** Assign levels to unranked nodes below species level, when possible */
jpayne@68 2216 public static boolean assignStrains=true;
jpayne@68 2217 /** Assume headers are in Silva format */
jpayne@68 2218 public static boolean SILVA_MODE=false;
jpayne@68 2219 /** Assume headers are in Unite format */
jpayne@68 2220 public static boolean UNITE_MODE=false;
jpayne@68 2221 /** Probably unnecessary at this point... present for legacy reasons */
jpayne@68 2222 public static boolean CRASH_IF_NO_GI_TABLE=true;
jpayne@68 2223
jpayne@68 2224 public static boolean verbose=false;
jpayne@68 2225 public static boolean SHOW_WARNINGS=false;
jpayne@68 2226
jpayne@68 2227 /** Maps IMG IDs to records from the dump file */
jpayne@68 2228 private static HashMap<Long, ImgRecord> imgMap;
jpayne@68 2229
jpayne@68 2230 /** Set to false if the tree is expected to be mutated.
jpayne@68 2231 * @TODO Remove mutable fields from the tree (like counters).
jpayne@68 2232 */
jpayne@68 2233 public static boolean ALLOW_SHARED_TREE=true;
jpayne@68 2234
jpayne@68 2235 /** Universal location of the shared TaxTree used by various classes */
jpayne@68 2236 private static TaxTree sharedTree;
jpayne@68 2237
jpayne@68 2238 /** A simpler and probably less safe version of sharedTree(...) */
jpayne@68 2239 public static TaxTree getTree(){return sharedTree;}
jpayne@68 2240
jpayne@68 2241 /**
jpayne@68 2242 * Fetch the shared tree, loading it from file if not present.
jpayne@68 2243 * @return A tree.
jpayne@68 2244 * @TODO: Check proper-construction of double-checked synchronize
jpayne@68 2245 */
jpayne@68 2246 private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) {
jpayne@68 2247 if(!ALLOW_SHARED_TREE){return null;}
jpayne@68 2248 if(sharedTree==null && fname!=null){
jpayne@68 2249 if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();}
jpayne@68 2250 synchronized(TaxTree.class){
jpayne@68 2251 if(sharedTree==null){
jpayne@68 2252 if(outstream!=null){outstream.println("Loading tax tree.");}
jpayne@68 2253 Timer t=new Timer(outstream, false);
jpayne@68 2254 setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat);
jpayne@68 2255 t.stopAndPrint();
jpayne@68 2256 }
jpayne@68 2257 }
jpayne@68 2258 }
jpayne@68 2259 if(hashNames && sharedTree.nameMap==null){
jpayne@68 2260 synchronized(sharedTree){
jpayne@68 2261 if(sharedTree.nameMap==null){
jpayne@68 2262 if(outstream!=null){outstream.println("Hashing names.");}
jpayne@68 2263 Timer t=new Timer(outstream, false);
jpayne@68 2264 sharedTree.hashNames(hashDotFormat);
jpayne@68 2265 t.stopAndPrint();
jpayne@68 2266 }
jpayne@68 2267 }
jpayne@68 2268 }
jpayne@68 2269 return sharedTree;
jpayne@68 2270 }
jpayne@68 2271
jpayne@68 2272 /**
jpayne@68 2273 * For initialization. Normally only one tree is needed by a process so it is set here.
jpayne@68 2274 * If the tree is already set nothing will happen, unless additional hashing is needed.
jpayne@68 2275 */
jpayne@68 2276 private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){
jpayne@68 2277 assert(ALLOW_SHARED_TREE);
jpayne@68 2278 assert(sharedTree==null);
jpayne@68 2279 sharedTree=tree;
jpayne@68 2280 if(hashNames && sharedTree.nameMap==null){
jpayne@68 2281 synchronized(sharedTree){
jpayne@68 2282 if(sharedTree.nameMap==null){
jpayne@68 2283 sharedTree.hashNames(hashDotFormat);
jpayne@68 2284 }
jpayne@68 2285 }
jpayne@68 2286 }
jpayne@68 2287 }
jpayne@68 2288
jpayne@68 2289 /**
jpayne@68 2290 * Determine whether a taxonomic level is standard. e.g.:<br>
jpayne@68 2291 * isSimple("phylum")=true<br>
jpayne@68 2292 * isSimple("subphylum")=false<br>
jpayne@68 2293 * isSimple("no-rank")=false
jpayne@68 2294 * @param levelExtended The extended level to test.
jpayne@68 2295 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
jpayne@68 2296 */
jpayne@68 2297 public static boolean isSimple(int levelExtended){
jpayne@68 2298 int level=extendedToLevel(levelExtended);
jpayne@68 2299 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level));
jpayne@68 2300 }
jpayne@68 2301
jpayne@68 2302 /**
jpayne@68 2303 * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br>
jpayne@68 2304 * isSimple("phylum")=true<br>
jpayne@68 2305 * isSimple("substrain")=true<br>
jpayne@68 2306 * isSimple("subphylum")=false<br>
jpayne@68 2307 * isSimple("no-rank")=false
jpayne@68 2308 * @param levelExtended The extended level to test.
jpayne@68 2309 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
jpayne@68 2310 */
jpayne@68 2311 public static boolean isSimple2(int levelExtended){
jpayne@68 2312 int level=extendedToLevel(levelExtended);
jpayne@68 2313 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)
jpayne@68 2314 || levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E);
jpayne@68 2315 }
jpayne@68 2316
jpayne@68 2317 /*--------------------------------------------------------------*/
jpayne@68 2318 /*---------------- Constants ----------------*/
jpayne@68 2319 /*--------------------------------------------------------------*/
jpayne@68 2320
jpayne@68 2321 /** Get the number for the normal level of this name */
jpayne@68 2322 public static final int stringToLevel(String s){return altLevelMap.get(s);}
jpayne@68 2323 public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);}
jpayne@68 2324 /** Get the number for the extended level of this name */
jpayne@68 2325 public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);}
jpayne@68 2326 /** Get the normal name for this normal level */
jpayne@68 2327 public static final String levelToString(int x){return taxLevelNames[x];}
jpayne@68 2328 /** Get the extended name for this extended level */
jpayne@68 2329 public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];}
jpayne@68 2330 /** Get the abbreviated name for this normal level */
jpayne@68 2331 public static final String levelToStringShort(int x){return taxLevelNamesShort[x];}
jpayne@68 2332
jpayne@68 2333 /** Normal, aka canonical, aka simple tax level names */
jpayne@68 2334 private static final String[] taxLevelNames=new String[] {
jpayne@68 2335 "no rank", "subspecies", "species", "genus",
jpayne@68 2336 "family", "order", "class", "phylum",
jpayne@68 2337 "kingdom", "superkingdom", "domain", "life"
jpayne@68 2338 };
jpayne@68 2339 public static final int numTaxLevelNames=taxLevelNames.length;
jpayne@68 2340
jpayne@68 2341 /**
jpayne@68 2342 * Definitive representation of all NCBI taxonomic level names.
jpayne@68 2343 * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash.
jpayne@68 2344 * The first dimension maps normal ranks to extended ranks.
jpayne@68 2345 * Both dimensions are ordered ascending.
jpayne@68 2346 * @TODO Note! If this goes over 63 names it will cause a problem with getDefinedLevels().
jpayne@68 2347 */
jpayne@68 2348 //TODO See @TODO
jpayne@68 2349 private static final String[][] taxLevelNamesExtendedMatrix=new String[][] {
jpayne@68 2350 {"no rank"},
jpayne@68 2351 {"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup",
jpayne@68 2352 "biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas",
jpayne@68 2353 "subspecies"},
jpayne@68 2354 {"species"},
jpayne@68 2355 {"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"},
jpayne@68 2356 {"subtribe", "tribe", "subfamily", "family"},
jpayne@68 2357 {"superfamily", "parvorder", "infraorder", "suborder", "order"},
jpayne@68 2358 {"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"},
jpayne@68 2359 {"superclass", "subdivision", "division", "subphylum", "phylum"},
jpayne@68 2360 {"superphylum", "subkingdom", "kingdom"},
jpayne@68 2361 {"superkingdom"},
jpayne@68 2362 {"domain"},
jpayne@68 2363 {"life"}
jpayne@68 2364 };
jpayne@68 2365
jpayne@68 2366 /** Extended tax level names as a 1D array */
jpayne@68 2367 private static final String[] taxLevelNamesExtended=makeNamesExtended();
jpayne@68 2368 /** Number of extended tax levels */
jpayne@68 2369 public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length;
jpayne@68 2370
jpayne@68 2371 /** Flatten the extended tax level names matrix to a 1D array */
jpayne@68 2372 private static final String[] makeNamesExtended(){
jpayne@68 2373 ArrayList<String> list=new ArrayList<String>();
jpayne@68 2374 for(String[] s : taxLevelNamesExtendedMatrix){
jpayne@68 2375 for(String ss : s){
jpayne@68 2376 list.add(ss);
jpayne@68 2377 }
jpayne@68 2378 }
jpayne@68 2379 return list.toArray(new String[0]);
jpayne@68 2380 }
jpayne@68 2381
jpayne@68 2382 /** Abbreviations of tax level names, mainly for semicolon form */
jpayne@68 2383 private static final String[] taxLevelNamesShort=new String[] {
jpayne@68 2384 "nr", "ss", "s", "g",
jpayne@68 2385 "f", "o", "c", "p",
jpayne@68 2386 "k", "sk", "d", "l"
jpayne@68 2387 };
jpayne@68 2388
jpayne@68 2389 /** Normal tax level numbers as constants */
jpayne@68 2390 public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3,
jpayne@68 2391 FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11;
jpayne@68 2392
jpayne@68 2393 /** TaxID of Life node */
jpayne@68 2394 public static final int LIFE_ID=1;
jpayne@68 2395 /** TaxID of Cellular Organisms node */
jpayne@68 2396 public static final int CELLULAR_ORGANISMS_ID=131567;
jpayne@68 2397 /** TaxID of Bacteria node */
jpayne@68 2398 public static final int BACTERIA_ID=2; //Is this safe? Who knows...
jpayne@68 2399 /** TaxID of Archaea node */
jpayne@68 2400 public static final int ARCHAEA_ID=2157;
jpayne@68 2401 /** TaxID of Euk node */
jpayne@68 2402 public static final int EUKARYOTA_ID=2759;
jpayne@68 2403 /** TaxID of Animal node */
jpayne@68 2404 public static final int METAZOA_ID=33208, ANIMALIA_ID=33208;
jpayne@68 2405 /** TaxID of Plant node */
jpayne@68 2406 public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090;
jpayne@68 2407 /** TaxID of Fungi node */
jpayne@68 2408 public static final int FUNGI_ID=4751;
jpayne@68 2409 /** TaxID of Virus node */
jpayne@68 2410 public static final int VIRUSES_ID=10239;
jpayne@68 2411 /** TaxID of Viroids node (now defunct) */
jpayne@68 2412 public static final int VIROIDS_ID=12884;
jpayne@68 2413
jpayne@68 2414 /** Maps normal level names to normal level numbers */
jpayne@68 2415 private static final HashMap<String, Integer> levelMap=makeLevelMap();
jpayne@68 2416 /** Maps extended level names to extended level numbers */
jpayne@68 2417 private static final HashMap<String, Integer> levelMapExtended=makeLevelMapExtended();
jpayne@68 2418 /** Maps extended level names to normal level numbers */
jpayne@68 2419 private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap();
jpayne@68 2420
jpayne@68 2421 /** Common extended level numbers as constants */
jpayne@68 2422 public static final int NO_RANK_E=NO_RANK,
jpayne@68 2423 SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"),
jpayne@68 2424 SUBSPECIES_E=stringToLevelExtended("subspecies"),
jpayne@68 2425 SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"),
jpayne@68 2426 FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"),
jpayne@68 2427 CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"),
jpayne@68 2428 KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"),
jpayne@68 2429 DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life");
jpayne@68 2430
jpayne@68 2431 /** Map of normal to extended level numbers */
jpayne@68 2432 private static final int[] levelToExtended=new int[] {
jpayne@68 2433 NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E,
jpayne@68 2434 ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E
jpayne@68 2435 };
jpayne@68 2436
jpayne@68 2437 /** Map of extended to normal level numbers */
jpayne@68 2438 private static final int[] extendedToLevel=makeExtendedToLevel();
jpayne@68 2439
jpayne@68 2440 /** Creates extendedToLevel from taxaNamesExtendedMatrix during initialization. */
jpayne@68 2441 private static int[] makeExtendedToLevel(){
jpayne@68 2442 int len=0;
jpayne@68 2443 for(String[] array : taxLevelNamesExtendedMatrix){
jpayne@68 2444 len+=array.length;
jpayne@68 2445 }
jpayne@68 2446 int[] ret=new int[len];
jpayne@68 2447
jpayne@68 2448 int pos=0;
jpayne@68 2449 for(int level=0; level<taxLevelNamesExtendedMatrix.length; level++){
jpayne@68 2450 String[] array=taxLevelNamesExtendedMatrix[level];
jpayne@68 2451 for(int i=0; i<array.length; i++){
jpayne@68 2452 ret[pos]=level;
jpayne@68 2453 pos++;
jpayne@68 2454 }
jpayne@68 2455 }
jpayne@68 2456 return ret;
jpayne@68 2457 }
jpayne@68 2458
jpayne@68 2459 /** Convert a standard level number (like KINGDOM) to extended (like KINGDOM_E). */
jpayne@68 2460 public static final int levelToExtended(int level){
jpayne@68 2461 return level<0 ? level : levelToExtended[level];
jpayne@68 2462 }
jpayne@68 2463
jpayne@68 2464 /** Convert an extended level number (like PHYLUM_E) to extended (like PHYLUM).
jpayne@68 2465 * Non-standard levels will be converted to the next higher standard level;
jpayne@68 2466 * e.g., subphylum -> phylum */
jpayne@68 2467 public static final int extendedToLevel(int extended){
jpayne@68 2468 return extended<0 ? -1 : extendedToLevel[extended];
jpayne@68 2469 }
jpayne@68 2470
jpayne@68 2471 /* Pre-compiled delimiters to save time when splitting lines */
jpayne@68 2472 private static final Pattern delimiterTab = Pattern.compile("\t");
jpayne@68 2473 private static final Pattern delimiter = Pattern.compile("\t\\|\t");
jpayne@68 2474 private static final Pattern delimiterPipe = Pattern.compile("\\|");
jpayne@68 2475 private static final Pattern delimiterTilde = Pattern.compile("\\~");
jpayne@68 2476 private static final Pattern delimiter2 = Pattern.compile("[\\s_]+");
jpayne@68 2477
jpayne@68 2478 public static boolean IMG_HQ=false;
jpayne@68 2479
jpayne@68 2480 /* For these fields, see the corresponding functions, below.
jpayne@68 2481 * They define the default paths to various data on NERSC. */
jpayne@68 2482
jpayne@68 2483 private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest";
jpayne@68 2484 private static final String defaultTaxPathAws="/test1/tax/latest";
jpayne@68 2485 private static final String defaultTaxPathIGBVM="/data/tax/latest";
jpayne@68 2486 private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
jpayne@68 2487 private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
jpayne@68 2488 private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
jpayne@68 2489 private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
jpayne@68 2490 private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz";
jpayne@68 2491 private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
jpayne@68 2492
jpayne@68 2493 private static final String defaultImgFile="TAX_PATH/imgDump.txt";
jpayne@68 2494 private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz";
jpayne@68 2495 private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz";
jpayne@68 2496 private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz";
jpayne@68 2497 private static final String defaultPatternFile="TAX_PATH/patterns.txt";
jpayne@68 2498 private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz";
jpayne@68 2499
jpayne@68 2500 private static final String defaultAccessionFile=
jpayne@68 2501 //"TAX_PATH/shrunk.protF.accession2taxid.gz," +
jpayne@68 2502 "TAX_PATH/shrunk.prot.accession2taxid.gz,"
jpayne@68 2503 + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz,"
jpayne@68 2504 + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz,"
jpayne@68 2505 + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz,"
jpayne@68 2506 // + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz,"
jpayne@68 2507 + "TAX_PATH/shrunk.dead_wgs.accession2taxid.gz,"
jpayne@68 2508 // + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz,"
jpayne@68 2509 + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz,"
jpayne@68 2510 + "TAX_PATH/shrunk.pdb.accession2taxid.gz";
jpayne@68 2511
jpayne@68 2512 /** For setting TAX_PATH, the root to taxonomy files */
jpayne@68 2513 public static final String defaultTaxPath(){
jpayne@68 2514 return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc;
jpayne@68 2515 }
jpayne@68 2516
jpayne@68 2517 /** 16S consensus sequences per TaxID */
jpayne@68 2518 public static final String default16SFile(){
jpayne@68 2519 return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc;
jpayne@68 2520 }
jpayne@68 2521
jpayne@68 2522 /** 18S consensus sequences per TaxID */
jpayne@68 2523 public static final String default18SFile(){
jpayne@68 2524 return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc;
jpayne@68 2525 }
jpayne@68 2526
jpayne@68 2527 /** Path to all taxonomy files, substituted in to make specific file paths */
jpayne@68 2528 public static String TAX_PATH=defaultTaxPath();
jpayne@68 2529
jpayne@68 2530 /** Location of gitable.int2d.gz for gi lookups */
jpayne@68 2531 public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);}
jpayne@68 2532 /** Location of tree.taxtree.gz */
jpayne@68 2533 public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);}
jpayne@68 2534
jpayne@68 2535 //Use the prot.FULL.gz ncbi file.
jpayne@68 2536 public static boolean protFull=false;
jpayne@68 2537
jpayne@68 2538 /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */
jpayne@68 2539 public static final String defaultAccessionFile(){
jpayne@68 2540 String s=(protFull ? "TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile;
jpayne@68 2541 return s.replaceAll("TAX_PATH", TAX_PATH);
jpayne@68 2542 }
jpayne@68 2543 /** Location of patterns.txt, which holds information about observed accession string formats */
jpayne@68 2544 public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);}
jpayne@68 2545 /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */
jpayne@68 2546 public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);}
jpayne@68 2547 /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */
jpayne@68 2548 public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);}
jpayne@68 2549
jpayne@68 2550 /** Screen output gets printed here */
jpayne@68 2551 private static PrintStream outstream=System.out;
jpayne@68 2552
jpayne@68 2553 }