Mercurial repository: rliterman / csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/TaxTree.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author:    jpayne
date:      Tue, 18 Mar 2025 16:23:26 -0400
parents:   (none)
children:  (none)
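Before the diff itself, a minimal usage sketch of the class being added. This is hedged: the demo class, the tree file path, and the organism names are invented for illustration; only the method signatures (loadTaxTree, getNodeByName, commonAncestor, toSemicolon) are taken from the code below.

import tax.TaxNode;
import tax.TaxTree;

public class TaxTreeDemo {
    public static void main(String[] args) {
        // Load a pre-built binary tree (path is hypothetical) and hash names
        // so that name-based lookups work.
        TaxTree tree = TaxTree.loadTaxTree("tree.taxtree.gz", System.err, true, true);

        // Name lookups return null if the name is not present in the tree.
        TaxNode human = tree.getNodeByName("Homo sapiens");
        TaxNode mouse = tree.getNodeByName("Mus musculus");

        // Walk up to the lowest common ancestor and print its
        // semicolon-delimited lineage, skipping non-canonical levels.
        TaxNode lca = tree.commonAncestor(human, mouse);
        System.out.println(tree.toSemicolon(lca, true, false));
    }
}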
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/TaxTree.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,2553 @@ +package tax; + +import java.io.PrintStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.regex.Pattern; + +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.TextFile; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import structures.ByteBuilder; +import structures.IntHashMap; +import structures.IntList; +import structures.IntLongHashMap; + +/** + * Represents a taxonomic tree. + * Usually just one of these needs to be created for a process. + * Designed for NCBI's taxdmp.zip file contents. + * @author Brian Bushnell + * @date Mar 6, 2015 + * + */ +public class TaxTree implements Serializable{ + + /** + * + */ + private static final long serialVersionUID = 5894416521711540017L; + + /*--------------------------------------------------------------*/ + /*---------------- Main ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * This is not called normally, only when converting NCBI text files + * into a binary representation and writing it to disk. + * @param args Command line arguments + */ + public static void main(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, outstream, null, false); + args=pp.args; + outstream=pp.outstream; + } + + assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n"; + ReadWrite.USE_UNPIGZ=true; + ReadWrite.USE_PIGZ=true; + ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9); + ReadWrite.PIGZ_BLOCKSIZE=256; + ReadWrite.PIGZ_ITERATIONS=60; + + Timer t=new Timer(); + TaxTree tree=new TaxTree(args); + t.stop(); + + outstream.println("Retained "+tree.nodeCount+" nodes:"); + + for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){ + outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]); + if(verbose){ + int lim=10; + for(int j=0; j<lim && j<tree.treeLevelsExtended[i].length; j++){ + TaxNode n=tree.treeLevelsExtended[i][j]; + outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); + } + for(int j=tree.treeLevelsExtended[i].length-lim; j<tree.treeLevelsExtended[i].length; j++){ + if(j>=lim){ + TaxNode n=tree.treeLevelsExtended[i][j]; + outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); + } + } + } + outstream.println(); + } + + + outstream.println(); + outstream.println("Time: \t"+t); + + if(args.length>2){//Write a tree + ReadWrite.write(tree, args[3], true); + } + } + + /** Parse arguments from the command line */ + private Parser parse(String[] args){ + + //Create a parser object + Parser parser=new Parser(); + + //Set any necessary Parser defaults here + //parser.foo=bar; + + //Parse each argument + for(int i=0; i<args.length; i++){ + String arg=args[i]; + + //Break arguments into their constituent parts, in the form of "a=b" + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? 
split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); + }else if(a.equals("parse_flag_goes_here")){ + long fake_variable=Parse.parseKMG(b); + //Set a variable here + }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser + //do nothing + }else if(i>3){ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + } + } + + return parser; + } + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /*--------------------------------------------------------------*/ + /*---------------- Constructors ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Constructor using filenames from command line arguments, in the format of: + * {names, nodes, merged} + * @param args Command line arguments + */ + private TaxTree(String[] args){ + this(args[0], args[1], args[2], args); + } + + /** + * @param namesFile NCBI names.txt + * @param nodesFile NCBI nodes.txt + * @param mergedFile NCBI merged.txt + * @param args + */ + private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){ + + if(args!=null) { + Parser parser=parse(args); + } + + nodes=getNames(namesFile); + getNodes(nodesFile, nodes); + + mergedMap=getMerged(mergedFile); + + countChildren(); + outstream.println("Counted children."); + int rounds=percolate(); + outstream.println("Percolated "+rounds+" rounds to fixpoint."); + + if(assignStrains){ + assignStrains(); + rounds=percolate(); + outstream.println("Percolated "+rounds+" rounds to fixpoint."); + } + + if(simplify){ + if(verbose){outstream.println("Simplifying.");} + int removed=simplify(nodes); + if(verbose){outstream.println("Removed "+removed+" nodes.");} + rounds=percolate(); + outstream.println("Percolated "+rounds+" rounds to fixpoint."); + } + int errors=test(nodes); +// assert(errors==0); //Not possible since the tree is wrong. + if(errors>0) { + System.err.println("Found "+errors+" errors in tree."); + } + + for(TaxNode n : nodes){ + if(n!=null){ + nodesPerLevel[n.level]++; + nodesPerLevelExtended[n.levelExtended]++; + } + } + +// for(int i=0; i<nodesPerLevel.length; i++){ +// treeLevels[i]=new TaxNode[nodesPerLevel[i]]; +// } + for(int i=0; i<nodesPerLevelExtended.length; i++){ + treeLevelsExtended[i]=new TaxNode[nodesPerLevelExtended[i]]; + } + +// { +// int[] temp=new int[nodesPerLevel.length]; +// for(TaxNode n : nodes){ +// if(n!=null){ +// int level=n.level; +// treeLevels[level][temp[level]]=n; +// temp[level]++; +// } +// } +// } + + { + int[] temp=new int[nodesPerLevelExtended.length]; + for(TaxNode n : nodes){ + if(n!=null){ + int level=n.levelExtended; + treeLevelsExtended[level][temp[level]]=n; + temp[level]++; + } + } + } + nodeCount=(int)Tools.sum(nodesPerLevelExtended); + + } + + /*--------------------------------------------------------------*/ + /*--------------- Loaders ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Load a tax tree from disk. + * @param taxTreeFile Serialized TaxTree. + * @param hashNames Hash nodes using names as keys + * @param hashDotFormat Hash nodes using abbreviations, e.g. 
H.sapiens + * @return + */ + public static final TaxTree loadTaxTree(String taxTreeFile, PrintStream outstream, boolean hashNames, + boolean hashDotFormat){ + if(taxTreeFile==null){return null;} + return loadTaxTree(taxTreeFile, null, null, null, outstream, hashNames, hashDotFormat); + } + + /** + * Load a tax tree from disk, either from a binary tree file, + * or from NCBI text files. + * @param taxTreeFile Binary representation; mutually exclusive with other files. + * @param taxNameFile NCBI names.txt + * @param taxNodeFile NCBI nodes.txt + * @param taxMergedFile NCBI merged.txt + * @param hashNames Hash nodes using names as keys + * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens + * @return The loaded tree + */ + public static final TaxTree loadTaxTree(String taxTreeFile, String taxNameFile, String taxNodeFile, + String taxMergedFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){ + if(taxTreeFile!=null || taxNodeFile==null){ + TaxTree tree=sharedTree(taxTreeFile, hashNames, hashDotFormat, outstream); + if(tree!=null){return tree;} + } + if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=defaultTreeFile();} + assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files."; + Timer t=new Timer(); + if(outstream!=null){outstream.print("\nLoading tax tree; ");} + final TaxTree tree; + if(taxTreeFile!=null){ + tree=ReadWrite.read(TaxTree.class, taxTreeFile, true); + }else{ + tree=new TaxTree(taxNameFile, taxNodeFile, taxMergedFile, null); + } + t.stop(); + if(hashNames){ + if(outstream!=null){outstream.println("Hashing names.");} + tree.hashNames(hashDotFormat); + } + if(outstream!=null){ + outstream.println("Time: \t"+t); + Shared.printMemory(); + outstream.println(); + } + if(ALLOW_SHARED_TREE){sharedTree=tree;} + return tree; + } + + /*--------------------------------------------------------------*/ + /*--------------- Constructor Helpers ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Finds unranked nodes in the archaeal and bacterial kingdoms. + * If these are below species level, have a ranked parent, + * and have no ranked children, they are assigned strain or substrain. 
+ */ + private void assignStrains(){ + + outstream.println("Assigning strains."); + int strains=0, substrains=0; + TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed yet + TaxNode archaea=getNode(ARCHAEA_ID); + assert(bacteria.name.equalsIgnoreCase("Bacteria")); + assert(archaea.name.equalsIgnoreCase("Archaea")); + + ArrayList<TaxNode> bactList=new ArrayList<TaxNode>(); + ArrayList<TaxNode> archList=new ArrayList<TaxNode>(); + for(TaxNode tn : nodes){ + if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){ + if(descendsFrom(tn, bacteria)){ + bactList.add(tn); + }else if(descendsFrom(tn, archaea)){ + archList.add(tn); + } + } + } + + ArrayList<TaxNode> prokList=new ArrayList<TaxNode>(bactList.size()+archList.size()); + prokList.addAll(bactList); + prokList.addAll(archList); + + for(TaxNode tn : prokList){ + if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ + TaxNode parent=nodes[tn.pid]; + if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ + tn.levelExtended=STRAIN_E; + tn.level=SUBSPECIES; + tn.setOriginalLevel(STRAIN_E); + strains++; + } + } + } + +// outstream.println("Assigned "+strains+" strains."); + for(TaxNode tn : prokList){ + if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ + TaxNode parent=nodes[tn.pid]; + if(parent.levelExtended==STRAIN_E){ + tn.levelExtended=SUBSTRAIN_E; + tn.level=SUBSPECIES; + tn.setOriginalLevel(SUBSTRAIN_E); + substrains++; + } + } + } +// outstream.println("Assigned "+substrains+" substrains."); + } + + @Deprecated + private void assignStrainsOld(){ + + outstream.println("Assigning strains."); + int strains=0, substrains=0; + TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed + assert(bacteria.name.equalsIgnoreCase("Bacteria")); + for(TaxNode tn : nodes){ + if(tn!=null && tn.originalLevel()==NO_RANK){ + TaxNode parent=nodes[tn.pid]; + if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){ +// nodesPerLevelExtended[STRAIN_E]++; +// nodesPerLevelExtended[tn.levelExtended]--; + tn.levelExtended=STRAIN_E; + tn.level=SUBSPECIES; + tn.setOriginalLevel(STRAIN_E); + strains++; + } + } + } +// outstream.println("Assigned "+strains+" strains."); + for(TaxNode tn : nodes){ + if(tn!=null && tn.originalLevel()==NO_RANK){ + TaxNode parent=nodes[tn.pid]; + if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){ +// nodesPerLevelExtended[SUBSTRAIN_E]++; +// nodesPerLevelExtended[tn.levelExtended]--; + tn.levelExtended=SUBSTRAIN_E; + tn.level=SUBSPECIES; + tn.setOriginalLevel(SUBSTRAIN_E); + substrains++; + } + } + } +// outstream.println("Assigned "+substrains+" substrains."); + } + + /*--------------------------------------------------------------*/ + /*---------------- Construction ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Create tax nodes using names in the designated file. + * @param fname NCBI names.txt + * @return Array of created nodes, where array[x] contains the node with TaxID x. 
+ */ + private static TaxNode[] getNames(String fname){ + ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000); + int max=0; + + TextFile tf=new TextFile(fname, false); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.contains("scientific name")){ + String[] split=delimiter.split(s, 3); + assert(split.length==3) : s; + int id=Integer.parseInt(split[0]); + String name=split[1]; + if(id==1 && name.equalsIgnoreCase("root")){name="Life";} + max=Tools.max(max, id); + list.add(new TaxNode(id, name)); + } + } + + TaxNode[] nodes=new TaxNode[max+1]; + for(TaxNode n : list){ + assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n; + nodes[n.id]=n; + } + + return nodes; + } + + /** + * Parses names file a second time to fill in additional information. + * Should really be merged into getNames. + * @TODO Merge into getNames + */ + private static TaxNode[] getNodes(String fname, TaxNode[] nodes){ + + int max=0; + + LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>(); + + TextFile tf=new TextFile(fname, false); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + String[] split=delimiter.split(s, 4); + assert(split.length==4) : s; + int id=-1, pid=-1, level=-1, levelExtended=-1; + + id=Integer.parseInt(split[0]); + try { + pid=Integer.parseInt(split[1]); + } catch (NumberFormatException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.err.println("Bad line: "+s+"\n"+Arrays.toString(split)); + } + boolean alt=false; + { + String key=split[2]; + Integer obj0=levelMap.get(key); + Integer obj=levelMapExtended.get(key); + assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split); + + if(obj0==null){ + obj0=altLevelMap.get(key); + alt=true; + } + if(obj0!=null){ + level=obj0; + levelExtended=obj; + if(id==pid){ + level=LIFE; + levelExtended=LIFE_E; + alt=false; + } + }else{ + if(id==pid){ + level=LIFE; + levelExtended=LIFE_E; + alt=false; + }else{ + int[] count=oddNames.get(key); + if(count==null){ + count=new int[1]; + oddNames.put(key, count); + } + count[0]++; + } + } + } + max=Tools.max(max, id); + TaxNode n=nodes[id]; + assert(n!=null && n.pid<0) : n+" -> "+s; + n.pid=pid; + n.level=level; + n.levelExtended=levelExtended; + n.setOriginalLevel(levelExtended); + n.setCanonical(!alt); + assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n"; + } + + if(oddNames.size()>0){ + outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:"); + if(verbose){ + for(String s : oddNames.keySet()){ + outstream.println(oddNames.get(s)[0]+"\t"+s); + } + } + } + + return nodes; + } + + /** + * Count child nodes of each node. + * This can be used to size arrays or determine which nodes are leaves. + */ + private void countChildren(){ + for(TaxNode child : nodes){ + if(child!=null && child.pid!=child.id){ + TaxNode parent=getNode(child.pid); + if(parent!=child){parent.numChildren++;} + } + } + } + + //TODO - This could be finished in 2 passes using the childTable. + /** + * Fill derived fields minParentLevelExtended and maxChildLevelExtended + * by percolating information through the tree until a fixpoint is reached. + * @TODO This could be finished in 2 passes using the childTable. + * @return Number of rounds required to reach fixpoint. 
+ */ + private int percolate(){ + boolean changed=true; + int rounds=0; + while(changed){ + changed=false; + rounds++; + for(TaxNode child : nodes){ + if(child!=null && child.pid!=child.id){ + TaxNode parent=getNode(child.pid); + changed=(child.discussWithParent(parent) | changed); + } + } + + if(!changed){break;} + changed=false; + rounds++; + for(int i=nodes.length-1; i>=0; i--){ + TaxNode child=nodes[i]; + if(child!=null && child.pid!=child.id){ + TaxNode parent=getNode(child.pid); + changed=(child.discussWithParent(parent) | changed); + } + } + } + return rounds; + } + + /** + * Load nodes into the nameMap and nameMapLower, mapped to their names. + * @param genusDotSpecies Also hash abbreviations such as E.coli. + */ + public synchronized void hashNames(boolean genusDotSpecies){ + if(nameMap!=null){return;} + assert(nameMap==null); + assert(nameMapLower==null); + final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN)); + nameMap=new HashMap<String, ArrayList<TaxNode>>(size); + nameMapLower=new HashMap<String, ArrayList<TaxNode>>(size); + + //Hash the names, both lowercase and uppercase + for(TaxNode n : nodes){ + if(n!=null){ + String name=n.name; + if(name.indexOf('_')>=0){ + name=name.replace('_', ' ').trim(); + } + if(name!=null && !name.equals("environmental samples")){ + { + ArrayList<TaxNode> list=nameMap.get(name); + if(list==null){ + list=new ArrayList<TaxNode>(); + nameMap.put(name, list); + } + list.add(n); + } + { + String lc=name.toLowerCase(); + ArrayList<TaxNode> list=nameMapLower.get(lc); + if(list==null){ + list=new ArrayList<TaxNode>(); + nameMapLower.put(lc, list); + } + list.add(n); + } + } + } + } + + //Hash G.species versions of the names, both lowercase and uppercase + if(genusDotSpecies){ + ByteBuilder bb=new ByteBuilder(64); + for(TaxNode n : nodes){ + if(n!=null && n.levelExtended==SPECIES_E){ + String name=n.name; + if(name.indexOf('_')>=0){ + name=name.replace('_', ' ').trim(); + } + if(name!=null && !name.equals("environmental samples")){ + final String dotFormat=dotFormat(name, bb); + if(dotFormat!=null){ + { + ArrayList<TaxNode> list=nameMap.get(dotFormat); + if(list==null){ + list=new ArrayList<TaxNode>(); + nameMap.put(dotFormat, list); + } + list.add(n); + } + { + String lc=dotFormat.toLowerCase(); + ArrayList<TaxNode> list=nameMapLower.get(lc); + if(list==null){ + list=new ArrayList<TaxNode>(); + nameMapLower.put(lc, list); + } + list.add(n); + } + } + } + } + } + } + } + + /** + * Generate the "dot format" name of a node. + * For example, transform "Homo sapiens" to "H.sapiens" + * @param name Node name + * @param buffer A ByteBuilder that may be modified + * @return Dot format + */ + private static String dotFormat(String name, ByteBuilder buffer){ + if(name==null || name.indexOf('.')>=0){return null;} + final int firstSpace=name.indexOf(' '); + if(firstSpace<0 || firstSpace>=name.length()-1){return null;} + final int lastSpace=name.lastIndexOf(' '); + if(firstSpace!=lastSpace){return null;} + final String a=name.substring(0, firstSpace); + final String b=name.substring(lastSpace+1); + final char ca=a.charAt(0); + final char cb=b.charAt(0); + if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;} + if(buffer==null){buffer=new ByteBuilder(2+b.length());} + else{buffer.clear();} + buffer.append(ca).append('.').append(b); + return buffer.toString(); + } + + /** + * Fill childMap, which maps nodes to their children. 
+ */ + public synchronized void hashChildren(){ + assert(childMap==null); + int nodesWithChildren=0; + for(TaxNode tn : nodes){ + if(tn!=null && tn.numChildren>0){nodesWithChildren++;} + } + childMap=new HashMap<TaxNode, ArrayList<TaxNode>>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN)); + for(TaxNode tn : nodes){ + if(tn!=null){ + if(tn.numChildren>0){ + childMap.put(tn, new ArrayList<TaxNode>(tn.numChildren)); + } + } + } + for(TaxNode tn : nodes){ + if(tn!=null){ + if(tn.id!=tn.pid){ + ArrayList<TaxNode> list=childMap.get(getNode(tn.pid)); + if(list!=null){list.add(tn);} + } + } + } + } + + /** + * Fetch this node's children. + * @param parent Node in question + * @return List of child nodes + */ + public ArrayList<TaxNode> getChildren(TaxNode parent){ + if(parent.numChildren<1){return null;} + if(childMap!=null){return childMap.get(parent);} + ArrayList<TaxNode> list=new ArrayList<TaxNode>(parent.numChildren); + for(TaxNode tn : nodes){ + if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){ + list.add(tn); + } + } + return list; + } + + /** + * Load a map of old to new TaxIDs. + * @param mergedFile NCBI merged.txt. + * @return Map of old to new TaxIDs + */ + private static IntHashMap getMerged(String mergedFile) { + if(mergedFile==null){return null;} + String[] lines=TextFile.toStringLines(mergedFile); + if(lines.length<1){return null;} + IntHashMap map=new IntHashMap((int)(lines.length*1.3)); + for(String line : lines){ + String[] split=delimiterTab.split(line); + int a=Integer.parseInt(split[0]); + int b=Integer.parseInt(split[2]); + map.put(a, b); + } + return map; + } + + /** + * Simplify the tree by assigning ranks to unranked nodes, + * where possible, through inference. + * Optionally removes unranked nodes based on the skipNorank field. + * @param nodes Array of all TaxNodes. + * @return Number of nodes removed. 
+ */ + private int simplify(TaxNode nodes[]){ + + int failed=test(nodes); + + int removed=0; + int reassigned=0; + + if(reassign){ + boolean changed=true; + int changedCount=0; + while(changed){ + changed=false; + for(int i=0; i<nodes.length; i++){ + TaxNode n=nodes[i]; + if(n!=null && n.levelExtended<1){ + int pid=n.pid; + TaxNode parent=nodes[pid]; + assert(parent!=null) : n; + if(n.maxDescendantLevelIncludingSelf()<SUBSPECIES_E && + parent.minAncestorLevelIncludingSelf()<=SPECIES_E && parent.minAncestorLevelIncludingSelf()>=SUBSPECIES_E){ +// if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ + changed=true; + n.levelExtended=SUBSPECIES_E; + n.level=SUBSPECIES; + changedCount++; + } + } + } + } + System.err.println("Assigned levels to "+changedCount+" unranked nodes."); + } + + + if(skipNorank){//Skip nodes with unknown taxa + if(verbose){outstream.println("A0");} + + for(int i=0; i<nodes.length; i++){ + TaxNode n=nodes[i]; + if(n!=null){ + int pid=n.pid; + TaxNode parent=nodes[pid]; + assert(parent!=null) : n; + assert(parent!=n || pid==1) : n+", "+pid; + while(parent.levelExtended<1 && n.levelExtended>parent.levelExtended){ + //System.err.println("Reassigned from "+parent); + assert(parent.id!=parent.pid); + parent=nodes[parent.pid]; + n.pid=parent.id; + reassigned++; + } + } + } + + for(int i=0; i<nodes.length; i++){ + if(nodes[i]!=null && nodes[i].levelExtended<0){ + System.err.println("Removed "+nodes[i]); + nodes[i]=null; + removed++; + } + } + if(verbose){outstream.println("Skipped "+reassigned+" unranked parents, removed "+removed+" invalid nodes.");} + } + + if(inferRankLimit>0){//Infer level for unset nodes (from "no rank") + if(verbose){outstream.println("A");} + int changed=1; + while(changed>0){ + changed=0; + for(final TaxNode n : nodes){ + if(n!=null){ + if(n.levelExtended==0){ + TaxNode parent=nodes[n.pid]; + if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){ + n.levelExtended=Tools.max(1, parent.levelExtended-1); + assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit); + changed++; + } + } + } + } + if(verbose){outstream.println("changed: "+changed);} + } + +// outstream.println("B"); +// for(TaxNode n : nodes){ +// if(n!=null && n.level==0){ +// n.level=-1; +// } +// } + } + + failed=test(nodes); + +// if(reassign){//Skip nodes with duplicate taxa +// if(verbose){outstream.println("D");} +// int changed=1; +// while(changed>0){ +// changed=0; +// for(final TaxNode n : nodes){ +// if(n!=null){ +// TaxNode parent=nodes[n.pid]; +// TaxNode grandparent=nodes[parent.pid]; +// assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent; +// assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent; +// +// while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) || +// n.level>parent.level || (n.level==parent.level))){ +// parent=grandparent; +// grandparent=nodes[parent.pid]; +// n.pid=parent.id; +// reassigned++; +// changed++; +// } +// } +// } +// if(verbose){outstream.println("changed: "+changed);} +// } +// if(verbose){outstream.println("E");} +// for(int i=0; i<nodes.length; i++){ +// if(nodes[i]!=null && nodes[i].level<0){ +// nodes[i]=null; +// removed++; +// } +// } +// } + + failed=test(nodes); + + if(verbose){outstream.println("F");} + {//Ensure the tree is now clean + for(int i=0; i<nodes.length; 
i++){ + TaxNode n=nodes[i]; + if(n!=null){ + TaxNode parent=nodes[n.pid]; + TaxNode grandparent=nodes[parent.pid]; + assert(n==parent || n.levelExtended<=parent.levelExtended || !n.canonical() || n.levelExtended<1 || parent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; + assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; + } + } + } + +// if(verbose){System.err.println("Reassignments: "+reassigned);} + + return removed; + } + + /*--------------------------------------------------------------*/ + /*---------------- Validation ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Ensure tree has monotonically increasing (or nondescending) ranks. + * @param nodes All TaxNodes. + * @return Number of violations. + */ + private static int test(TaxNode[] nodes){ + int failed=0; + for(final TaxNode n : nodes){ + if(n!=null){ + TaxNode parent=nodes[n.pid]; + try { + assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) : + "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n" + + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended; + assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent; +// assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical() || !parent.canonical()) : n+" -> "+parent; + if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){ + if(verbose){outstream.println("Error: "+n+" -> "+parent);} + failed++; + }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){ +// if(verbose){outstream.println("Error: "+n+" -> "+parent);} +// failed++; + } + assert(n!=parent || n.id<=1) : n; + } catch (Throwable e) { + // TODO Auto-generated catch block + e.printStackTrace(); + failed++; + } + } + } + if(verbose || failed>0){outstream.println(failed+" nodes failed.");} + return failed; + } + + /*--------------------------------------------------------------*/ + /*---------------- Print Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Format full name in semicolon format, e.g. + * "SK:Bacteria;P:Protobacteria;..." + * @param tn0 Base node + * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. + * @return Resultant String + */ + public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){ + StringBuilder sb=new StringBuilder(); + if(tn0==null){return "Not found";} + String semi=""; + ArrayList<TaxNode> list=toAncestors(tn0, skipNonCanonical); + boolean addTaxLevel=true; + for(int i=list.size()-1; i>=0; i--){ + sb.append(semi); + TaxNode tn=list.get(i); + if(tn.id!=LIFE_ID || list.size()==1){ + if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){ + sb.append(tn.levelToStringShort()).append(':'); + } + sb.append(mononomial ? mononomial(tn) : tn.name); + semi=";"; + } + } + return sb.toString(); + } + + /** + * Return a list of TaxIDs of all ancestors. + * @param tn0 Base node + * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. + * @return List of TaxIDs. 
+ */ + public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){ + if(tn0==null){return null;} + IntList list=new IntList(8); + + TaxNode tn=tn0; + while(tn!=null){ + if(!skipNonCanonical || tn.isSimple()){ + if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);} + } + if(tn.pid==tn.id){break;} + tn=getNode(tn.pid); + } + if(list.isEmpty()){list.add(tn0.id);} + return list; + } + + /** + * Return a list of all ancestors. + * @param tn0 Base node + * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. + * @return List of ancestor nodes. + */ + public ArrayList<TaxNode> toAncestors(final TaxNode tn0, boolean skipNonCanonical){ + if(tn0==null){return null;} + ArrayList<TaxNode> list=new ArrayList<TaxNode>(8); + + TaxNode tn=tn0; + while(tn!=null){ + if(!skipNonCanonical || tn.isSimple()){ + if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);} + } + if(tn.pid==tn.id){break;} + tn=getNode(tn.pid); + } + if(list.isEmpty()){list.add(tn0);} + return list; + } + + /** + * Generate a path to the genome of an organism on the filesystem; + * used by ExplodeTree. Intended for internal JGI use. + * @param root Location of the exploded tree. + * @return Path to a genome. + */ + public String toDir(TaxNode node, String root){ + StringBuilder sb=new StringBuilder(); + if(root==null){root="";} + sb.append(root); + if(root.length()>0 && !root.endsWith("/")){sb.append('/');} + IntList list=toAncestorIds(node, false); + list.reverse(); + assert(list.get(0)==1) : list + "," +getNode(list.get(0)); + for(int i=0; i<list.size(); i++){ + sb.append(list.get(i)); + sb.append('/'); + } + return sb.toString(); + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Use various techniques to get a TaxID from an unknown String, such as parsing, + * name lookups, and accession translation. + * @param s String to process. + * @return Decoded TaxID. + */ + public static int getID(String s){return GiToTaxid.getID(s);} + + /** + * Use various techniques to get a TaxID from an unknown byte[], such as parsing, + * name lookups, and accession translation. + * @param s String to process. + * @return Decoded TaxID. + */ + public static int getID(byte[] s){return GiToTaxid.getID(s);} + + /** Return the lowest ancestor of the named node with taxonomic level at least minLevel */ + public TaxNode getNode(String s, int minLevelExtended){ + TaxNode tn=parseNodeFromHeader(s, true); + while(tn!=null && tn.levelExtended<minLevelExtended && tn.pid!=tn.id){ + tn=getNode(tn.pid); + } + return tn; + } + + /** + * Determine whether a node is a descendant of another. + * @param child Possible child TaxID. + * @param parent Possible parent TaxID. + * @return true iff child descends from parent. + */ + public boolean descendsFrom(final int child, final int parent){ + TaxNode cn=getNode(child), pn=getNode(parent); + assert(cn!=null) : "Invalid taxID: "+child; + assert(pn!=null) : "Invalid taxID: "+parent; + return descendsFrom(cn, pn); + } + + /** + * Determine whether a node is a descendant of another. + * @param child Possible child node. + * @param parent Possible parent node. + * @return true iff child descends from parent. 
+ */ + public boolean descendsFrom(TaxNode child, TaxNode parent){ + assert(child!=null && parent!=null) : "Null parameters."; + if(child==null || parent==null){return false;} + + while(child!=parent && child.levelExtended<=parent.levelExtended && child.id!=child.pid){ + child=getNode(child.pid); + } + return child==parent; + } + + /** + * Determine whether an organism is classified as X. + * @param taxID taxID of organism. + * @param ancestorID taxID of possible ancestor. + * @return true if the organism is an X. + */ + public boolean descendsFrom2(int taxID, final int ancestorID) { + TaxNode tn=getNode(taxID); + while(tn.id!=tn.pid){ + if(tn.id==ancestorID){return true;} + tn=getNode(tn.pid); + } + return false; + } + + /** Determine whether an organism is classified as a plant. */ + public boolean isPlant(int taxID) {return descendsFrom2(taxID, VIRIDIPLANTAE_ID);} + + /** Determine whether an organism is classified as an animal. */ + public boolean isAnimal(int taxID) {return descendsFrom2(taxID, METAZOA_ID);} + + /** Determine whether an organism is classified as a fungus. */ + public boolean isFungus(int taxID) {return descendsFrom2(taxID, FUNGI_ID);} + + /** Determine whether an organism is classified as a eukaryote. */ + public boolean isEukaryote(int taxID) {return descendsFrom2(taxID, EUKARYOTA_ID);} + + /** Determine whether an organism is classified as a prokaryote. */ + public boolean isProkaryote(int taxID) { + TaxNode tn=getNode(taxID); + if(tn==null){ + System.err.println("*** Warning: Can't find node "+taxID+" ***"); + return false; + } + while(tn.id!=tn.pid){ + if(tn.id==BACTERIA_ID || tn.id==ARCHAEA_ID){return true;} + tn=getNode(tn.pid); + } + return false; + } + + /** + * Calculate the common ancestor of two nodes. + * @param a TaxID of a node. + * @param b TaxID of a node. + * @return Common ancestor ID of a and b. + */ + public int commonAncestor(final int a, final int b){ + TaxNode an=getNode(a), bn=getNode(b); + assert(an!=null) : "Invalid taxID: "+a; + assert(bn!=null) : "Invalid taxID: "+b; + TaxNode cn=commonAncestor(an, bn); + assert(cn!=null) : "No common ancestor: "+an+", "+bn; + if(cn==null){return -1;} + return cn.id; + } + + /** + * Calculate the common ancestor of two nodes. + * @param a A node. + * @param b A node. + * @return Common ancestor of a and b. + */ + public TaxNode commonAncestor(TaxNode a, TaxNode b){ + assert(a!=null && b!=null) : "Null parameters."; + if(a==null){return b;} + if(b==null){return a;} + + while(a!=b){ + if(a.levelExtended<b.levelExtended){ + a=getNode(a.pid); + }else{ + b=getNode(b.pid); + } + } + return a; + } + + /** + * Identify the highest ancestor of a node; + * this will presumably be "Life". + * @param a Node + * @return Highest ancestor + */ + public TaxNode highestAncestor(TaxNode a){ + assert(a!=null); + while(a.id!=a.pid){a=getNode(a.pid);} + return a; + } + + /*--------------------------------------------------------------*/ + /*---------------- Header Parsing ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Determine the TaxID of a String, + * without a loaded TaxTree. + * This only works if the literal TaxID is embedded in the String. 
+ * @param header Typically a sequence header + * @return Decoded TaxID, or -1 if unsuccessful + */ + public static int parseHeaderStatic(String header){ + if(header.length()<3){return -1;} + if(header.charAt(0)=='>'){header=header.substring(1);} + if(!header.startsWith("tid|")){return -1;} + int idx=3; + int idx2=header.indexOf('|', 4); + if(idx2<5){return -1;} + int id=-1; + try { + id=Parse.parseInt(header, idx+1, idx2); +// System.err.println("d"+", "+header.substring(idx+1, idx2)); + } catch (Throwable e) { +// System.err.println("e"+", "+header.substring(idx+1, idx2)); + //ignore + } + return id; + } + + /** + * Determine the TaxID of a String. + * @param header Typically a sequence header + * @param bestEffort In some cases, try certain substrings if the name is not found. + * @return + */ + public TaxNode parseNodeFromHeader(String header, boolean bestEffort){ + if(header==null || header.length()<2){return null;} + if(header.charAt(0)=='>'){header=header.substring(1);} + TaxNode tn; + if(SILVA_MODE){ + tn=getNodeSilva(header, bestEffort); + }else if(UNITE_MODE){ + tn=getNodeUnite(header, bestEffort); + }else{ + final char delimiter=ncbiHeaderDelimiter(header); + if(delimiter==' '){ + tn=getNodeNewStyle(header); + }else{ + tn=getNodeOldStyle(header, delimiter); + if(tn==null && delimiter=='|'){ +// System.err.println("A: "+header); + int id=-1; + String[] split=header.split("\\|"); + if(AccessionToTaxid.LOADED()){ + for(int i=0; i<split.length && id<0; i++){//Try accessions first + if(AccessionToTaxid.isValidAccession(split[i])){ + id=AccessionToTaxid.get(split[i]); + } + } + } + for(int i=0; i<split.length && id<0; i++){//Then names + id=parseNameToTaxid(split[i]); + } +// System.err.println("E: "+id); + if(id>=0){tn=getNode(id);} +// System.err.println("F: "+tn); + } + } + } + return tn; + } + + /** + * Guess the delimiter character in a String; + * typically assumed to be '|', '~', or ' '. + */ + public static char ncbiHeaderDelimiter(String header){ + for(int i=0; i<header.length(); i++){ + final char c=header.charAt(i); + if(c=='|' || c=='~'){ + assert(i>0) : "i="+i+"; malformatted header '"+header+"'"; + return c; + }else if(Character.isWhitespace(c)){ + return ' '; + } + } + return ' '; + } + + /** + * Parse a Silva header to a Node. + * @param s Silva header. + * @param bestEffort Try certain substrings if the name is not found. + * @return Node + */ + TaxNode getNodeSilva(String s, boolean bestEffort){ + if(s==null){return null;} + if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ + return getNodeOldStyle(s, s.charAt(3)); + } + String[] split=Tools.semiPattern.split(s); + + int number=-1; +// final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast")); +// if(chloroplast){return null;} + for(int i=split.length-1; number<0 && i>=0; i--){ + String last=split[i]; + int paren=last.indexOf('('); + if(paren>=0){last=last.substring(0, paren);} + last=last.trim(); + + if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){ + number=parseNameToTaxid(last); + } + + if(number>=0){return getNode(number);} + else if(!bestEffort){break;} + } + return null; + } + + /** + * Parse a Unite header to a Node. + * @param s Unite header. + * @param bestEffort Try certain substrings if the name is not found. 
+ * @return Node + */ + TaxNode getNodeUnite(String s, boolean bestEffort){ + if(s==null){return null;} + if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ + return getNodeOldStyle(s, s.charAt(3)); + } + String[] split=Tools.pipePattern.split(s); + + int number=-1; + String name=split[0]; + String acc=split[1]; + if(AccessionToTaxid.LOADED() && acc.length()>0){ + number=AccessionToTaxid.get(acc); + } + if(number<1){ + TaxNode tn=getNodeByName(name); + if(tn!=null){number=tn.id;} + } + + if(number>=0){return getNode(number);} + return null; + } + + /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */ + private TaxNode getNodeOldStyle(final String s, char delimiter){ + { + int index=s.indexOf(delimiter); + if(index<0){ + delimiter='~'; + index=s.indexOf(delimiter); + if(index<0){ + delimiter='_'; + index=s.indexOf(delimiter); + } + } + int number=-1; + + Throwable e=null; + + if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){ +// System.err.println("Parsing gi number."); + + if(GiToTaxid.isInitialized()){ + try { + number=GiToTaxid.parseGiToTaxid(s, delimiter); + } catch (Throwable e2) { + e=e2; + } + }else{ + assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s; + } +// if(number!=-1){System.err.println("number="+number);} + }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){ +// System.err.println("Parsing ncbi number."); + number=GiToTaxid.parseTaxidNumber(s, delimiter); + }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){ +// System.err.println("Parsing ncbi number."); + long img=parseDelimitedNumber(s, delimiter); + ImgRecord record=imgMap.get(img); + number=(record==null ? -1 : record.taxID); + }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete +// System.err.println("Parsing ncbi number."); + number=GiToTaxid.parseTaxidNumber(s, delimiter); + } + + if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){ + String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s)); + if(AccessionToTaxid.LOADED()){ + number=parseAccessionToTaxid(split); + } + if(number<0){ + number=parseHeaderNameToTaxid(split); + } + } + + if(number<0 && e!=null){ + assert(false) : e; + throw new RuntimeException(e); + } + + //TaxServer code could go here... + + if(number>=0){return getNode(number);} + } + if(verbose){System.err.println("Can't process name "+s);} + if(Tools.isDigit(s.charAt(0)) && s.length()<=9){ + try { + return getNode(Integer.parseInt(s)); + } catch (NumberFormatException e) { + //ignore + } + } + return null; + } + + /** Parse a delimited number from a header, or return -1 if formatted incorrectly. */ + static long parseDelimitedNumber(String s, char delimiter){ + if(s==null){return -1;} + int i=0; + while(i<s.length() && s.charAt(i)!=delimiter){i++;} + i++; + if(i>=s.length() || !Tools.isDigit(s.charAt(i))){return -1;} + + long number=0; + while(i<s.length()){ + char c=s.charAt(i); + if(c==delimiter || c==' ' || c=='\t'){break;} + assert(Tools.isDigit(c)) : c+"\n"+s; + number=(number*10)+(c-'0'); + i++; + } + return number; + } + + /** Parses sequence headers using NCBI's current header system, with Accessions. 
*/ + private TaxNode getNodeNewStyle(final String s){ + + int space=s.indexOf(' '); + int number=-1; + + if(AccessionToTaxid.LOADED()){ + if(space>0){ + number=AccessionToTaxid.get(s.substring(0, space)); + }else{ + number=AccessionToTaxid.get(s); + } + } + + if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){ + try { + return getNode(Integer.parseInt(s)); + } catch (NumberFormatException e) { + //ignore + } + } + + if(number<0 && space>0){ + number=parseNameToTaxid(s.substring(space+1)); + } + + if(number>-1){return getNode(number);} + if(space<0 && s.indexOf('_')>0){ + return getNodeNewStyle(s.replace('_', ' ')); + } + return null; + } + + /** + * For parsing old-style NCBI headers. + */ + public int parseAccessionToTaxid(String[] split){ + if(split.length<4){ + return -1; + } + int ncbi=AccessionToTaxid.get(split[3]); + return ncbi; + } + + /** + * For parsing old-style NCBI headers. + */ + public int parseHeaderNameToTaxid(String[] split){ + if(split.length<5){ + return -1; + } + return parseNameToTaxid(split[4]); + } + + /** + * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens"). + * If multiple nodes share the same name, returns the first; to get the full list, + * use getNodesByNameExtended. + * @param name Organism name. + * @return Organism TaxID, or -1 if not found. + */ + public int parseNameToTaxid(String name){ +// assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size()); + List<TaxNode> list=null; + + list=getNodesByNameExtended(name); + + if(list==null || list.size()>1){return -1;} + return list.get(0).id; + } + + /** + * Fetch nodes indicated by this name. + * @param name A taxonomic name delimited by space or underscore. + * @return Nodes corresponding to the name. 
+ */ + public List<TaxNode> getNodesByNameExtended(String name){ + List<TaxNode> list=null; + + list=getNodesByName(name); + if(list!=null){return list;} + + name=name.replaceAll("_", " ").trim(); + list=getNodesByName(name); + if(list!=null){return list;} + + String[] split2=name.split(" "); + + if(split2.length>7){ + String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7]; + list=getNodesByName(term); +// System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + + if(split2.length>6){ + String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]; + list=getNodesByName(term); +// System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + + if(split2.length>5){ + String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]; + list=getNodesByName(term); +// System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + if(split2.length>4){ + String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]; + list=getNodesByName(term); +// System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + if(split2.length>3){ + String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]; + list=getNodesByName(term); +// System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + if(split2.length>2){ + String term=split2[0]+" "+split2[1]+" "+split2[2]; + list=getNodesByName(term); +// System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + if(split2.length>1){ + String term=split2[0]+" "+split2[1]; + list=getNodesByName(term); +// System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + if(split2.length>0){ + String term=split2[0]; + list=getNodesByName(term); +// System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); + if(list!=null){return list;} + } + + return null; + } + + /*--------------------------------------------------------------*/ + /*---------------- Assorted Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Return the TaxID of the lowest ancestor node at least the specified level, + * including this node itself. Level is the normal (non-extended) level. + * @param taxID + * @param taxLevel + * @return + */ + public int promote(final int taxID, int taxLevel){ + TaxNode tn=null; + tn=(taxID<1 ? null : getNode(taxID)); + tn=promote(tn, taxLevel); + return (tn==null ? taxID : tn.id); + } + + /** + * Fetch the first node in this node's lineage of at least the indicated level. + * This can be the node itself or an ancestor. 
+ * @see getNodeAtLevelExtended + * @param tn Node in question + * @param taxLevel Desired minimum level + * @return A node at the desired level + */ + public TaxNode promote(TaxNode tn, int taxLevel){ + while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){ + TaxNode temp=getNode(tn.pid); + if(temp==null || temp==tn || temp.level>=TaxTree.LIFE || temp.level>taxLevel){break;} + tn=temp; + } + return tn; + } + + /** + * Determine the TaxID of the node's parent. + * @param id TaxID of child node + * @return Parent TaxID + */ + public int getParentID(int id){ + assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." + + "\nPlease redownload it and regenerate the taxtree."; + if(id<0 || id>=nodes.length){return -1;} + TaxNode tn=nodes[id]; + if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);} + return tn==null ? -1 : tn.pid; + } + + /** + * Fetch the node with this TaxID. + * @param id TaxID + * @return Node + */ + public TaxNode getNode(int id){ + assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." + + "\nPlease redownload it and regenerate the taxtree."; + if(id<0 || id>=nodes.length){return null;} + TaxNode tn=nodes[id]; + if(tn!=null || mergedMap==null){return tn;} + return getNode(mergedMap.get(id), true); + } + + /** + * Fetch the node with this TaxID, but don't throw assertions upon failure. + * @param id TaxID + * @return Node + */ + public TaxNode getNode(int id, boolean skipAssertion){ + assert(skipAssertion || id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." + + "\nPlease redownload it and regenerate the taxtree."; + if(id<0 || id>=nodes.length){return null;} + TaxNode tn=nodes[id]; + if(tn!=null || mergedMap==null){return tn;} + return getNode(mergedMap.get(id), true); + } + + public TaxNode getNodeAtLevel(int id, int minLevel){ + return getNodeAtLevel(id, minLevel, DOMAIN); + } + + public TaxNode getNodeAtLevelExtended(int id, int minLevelE){ + return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E); + } + + public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){ + final int minLevelExtended=levelToExtended(minLevel); + final int maxLevelExtended=levelToExtended(maxLevel); + return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended); + } + + public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){ + TaxNode tn=getNode(id); + while(tn!=null && tn.pid!=tn.id && tn.levelExtended<minLevelE){ + TaxNode temp=getNode(tn.pid); + if(temp==null || temp.levelExtended>maxLevelE){break;} + tn=temp; + } + return tn; + } + + public int getIdAtLevelExtended(int taxID, int taxLevelExtended){ + if(taxLevelExtended<0){return taxID;} + TaxNode tn=getNode(taxID); + while(tn!=null && tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){ + tn=getNode(tn.pid); + if(tn.levelExtended>taxLevelExtended){break;} + taxID=tn.id; + } + return taxID; + } + + /** + * Fetch the node with this name. + * Throw an assertion if there are multiple such nodes. + * @param s Organism name. + * @return Node with given name. 
+ */ + public TaxNode getNodeByName(String s){ + List<TaxNode> list=getNodesByName(s, false); + if(list==null){list=getNodesByName(s, true);} + if(list==null || list.isEmpty()){return null;} + if(list.size()==1){return list.get(0);} + assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n"; + TaxNode a=list.get(0); + for(int i=1; i<list.size(); i++){ + TaxNode b=list.get(i); + //Keep the most specific node +// if(a==null || (b!=null && b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf())){//not necessary + if(b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf()){ + a=b; + } + } + return a; + } + + /** + * Fetch a list of all nodes with this name. + * @param s Organism name. + * @return Nodes with given name. + */ + public List<TaxNode> getNodesByName(String s){ + List<TaxNode> list=getNodesByName(s, false); + if(list==null){list=getNodesByName(s, true);} + return list; + } + + /** + * Fetch a map of names to nodes. If absent, create it first. + * @param lowercase If true, return the map with lowercase keys. + * @return Map of names to nodes. + */ + private HashMap<String, ArrayList<TaxNode>> getMap(boolean lowercase){ + HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap); + if(map==null){ + synchronized(this){hashNames(true);} + map=(lowercase ? nameMapLower : nameMap); + } + assert(map!=null) : "Tax names were not hashed."; + return map; + } + + private List<TaxNode> getNodesByName(String s, boolean lowercase){ + if(s==null){return null;} + if(s.indexOf('_')>=0){s=s.replace('_', ' ');} + if(lowercase){s=s.toLowerCase();} +// System.err.println("Searching for "+s); + final HashMap<String, ArrayList<TaxNode>> map=getMap(lowercase); + ArrayList<TaxNode> list=map.get(s); + if(list!=null){return list;} +// System.err.println("No matches for '"+s+"'"); + +// assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s); + + if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;} + String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8); +// System.err.println("Array: "+Arrays.toString(split)); + list=map.get(split[split.length-1]); + if(list==null){return list;} +// System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")"); + + int matchCount=0; + for(TaxNode tn : list){ + if(tn.matchesName(split, split.length-1, this)){matchCount++;} + } + if(matchCount==list.size()){return list;} + if(matchCount<1){return null;} + ArrayList<TaxNode> hits=new ArrayList<TaxNode>(matchCount); + for(TaxNode tn : list){ + if(tn.matchesName(split, split.length-1, this)){hits.add(tn);} + } + return hits; + } + public ArrayList<TaxNode> getAncestors(int id){ + TaxNode current=getNode(id); + ArrayList<TaxNode> list=new ArrayList<TaxNode>(); + while(current!=null && current.pid!=current.id){//ignores root + list.add(current); + current=getNode(current.pid); + } + //optionally add root here + return list; + } + + public void increment(IntList ids, IntList counts, boolean sync){ + + ids.sort(); + ids.getUniqueCounts(counts); + + if(!sync){ + for(int i=0; i<ids.size; i++){ + int id=ids.get(i); + int count=counts.get(i); + incrementRaw(id, count); + } + }else{ + synchronized(this){ + for(int i=0; i<ids.size; i++){ + int id=ids.get(i); + int count=counts.get(i); + incrementRaw(id, count); + } + } + } + } + + public void incrementRaw(int id, long amt){ + assert(id>=0 && id<nodes.length) : "TaxID "+id+" is out of range."+(id<0 ? 
"" : " Possibly the taxonomy data needs to be updated."); + assert(nodes[id]!=null) : "No node for TaxID "+id+"; possibly the taxonomy data needs to be updated."; + nodes[id].incrementRaw(amt); + } + + public void percolateUp(){ + for(int i=0; i<treeLevelsExtended.length; i++){ + percolateUp(i); + } + } + + public void percolateUp(final int fromLevel){ + final TaxNode[] stratum=treeLevelsExtended[fromLevel]; + for(final TaxNode n : stratum){ + n.incrementSum(n.countRaw); + TaxNode parent=nodes[n.pid]; + if(n!=parent){ + parent.incrementSum(n.countSum); + } + } + } + + /** Add this amount to the node and all its ancestors. */ + public void percolateUp(TaxNode node, long amt){ + if(amt==0){return;} + if(verbose){System.err.println("percolateUp("+amt+") node: "+node);} + while(node.id!=node.pid){ + node.incrementSum(amt); + node=nodes[node.pid]; + } + node.incrementSum(amt); + } + + public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){ + return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1); + } + + public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){ + final int minLevelExtended=levelToExtended(minLevel); + final int maxLevelExtended=levelToExtended(maxLevel); +// assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended; + ArrayList<TaxNode> list=new ArrayList<TaxNode>(); + for(int i=minLevelExtended; i<nodesPerLevelExtended.length && i<=maxLevelExtended; i++){ + list.addAll(gatherNodesAtLeastLimitExtended(i, limit)); + } + Shared.sort(list, TaxNode.countComparator); + return list; + } + + public ArrayList<TaxNode> gatherNodesAtLeastLimitExtended(final int fromLevelExtended, final long limit){ + ArrayList<TaxNode> list=new ArrayList<TaxNode>(); + final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended]; + for(final TaxNode n : stratum){ + if(n.countSum>=limit){ + list.add(n); + TaxNode parent=nodes[n.pid]; + if(n!=parent){ + percolateUp(parent, -n.countSum);//123 This was negative for some reason + } + } + } + Shared.sort(list, TaxNode.countComparator); + return list; + } + + /*--------------------------------------------------------------*/ + /*---------------- Static Initializers ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Generate the name to level number map. + */ + private static HashMap<String, Integer> makeLevelMap() { + HashMap<String, Integer> map=new HashMap<String, Integer>(31); + for(int i=0; i<taxLevelNames.length; i++){ + map.put(taxLevelNames[i], i); + map.put(taxLevelNames[i].toUpperCase(), i); + } + map.put("clade", NO_RANK); + map.put("clade".toUpperCase(), NO_RANK); + return map; + } + + /** + * Generate the name to extended level number map. + */ + private static HashMap<String, Integer> makeLevelMapExtended() { + HashMap<String, Integer> map=new HashMap<String, Integer>(129); + for(int i=0; i<taxLevelNamesExtended.length; i++){ + map.put(taxLevelNamesExtended[i], i); + map.put(taxLevelNamesExtended[i].toUpperCase(), i); + } + map.put("clade", NO_RANK_E); + map.put("clade".toUpperCase(), NO_RANK_E); + return map; + } + + /** + * I think this maps normal and extend names to normal level numbers. 
+ */ + private static HashMap<String, Integer> makeAltLevelMap() { + HashMap<String, Integer> map=new HashMap<String, Integer>(129); + for(int i=0; i<taxLevelNames.length; i++){ + map.put(taxLevelNames[i], i); + map.put(taxLevelNames[i].toUpperCase(), i); + } + map.put("clade", NO_RANK); + map.put("clade".toUpperCase(), NO_RANK); + + //Add synonyms +// map.put("subfamily", map.get("family")); +// map.put("tribe", map.get("family")); +// map.put("varietas", map.get("subspecies")); +// map.put("subgenus", map.get("genus")); +// map.put("forma", map.get("subspecies")); +// map.put("species group", map.get("genus")); +// map.put("species subgroup", map.get("genus")); +// map.put("cohort", map.get("class")); +// map.put("subclass", map.get("class")); +// map.put("infraorder", map.get("order")); +// map.put("superorder", map.get("class")); +// map.put("subphylum", map.get("phylum")); +// map.put("infraclass", map.get("class")); +// map.put("superkingdom", map.get("division")); +// map.put("parvorder", map.get("order")); +// map.put("superclass", map.get("phylum")); +// map.put("superphylum", map.get("kingdom")); +// map.put("subkingdom", map.get("kingdom")); +// map.put("superfamily", map.get("order")); +// map.put("superkingdom", map.get("domain")); +// map.put("suborder", map.get("order")); +// map.put("subtribe", map.get("family")); + + for(String[] array : taxLevelNamesExtendedMatrix){ + String head=array[array.length-1]; + Integer value=map.get(head); + assert(value!=null) : head; + for(String key : array){ + if(key!=head){ + assert(!map.containsKey(key)) : "Map already contains key "+key+": "+Arrays.toString(array); + map.put(key, value); + map.put(key.toUpperCase(), value); + } + } + } + + return map; + } + + /*--------------------------------------------------------------*/ + /*---------------- Size ----------------*/ + /*--------------------------------------------------------------*/ + + /** Number of bp associated with this node in RefSeq */ + public long toSize(TaxNode tn){ + if(tn==null){return 0;} + if(refseqSizeMap==null){return -1L;} + final long x=refseqSizeMap.get(tn.id); + return Tools.max(0, x); + } + + /** Number of bp associated with this node and descendants in RefSeq */ + public long toSizeC(TaxNode tn){ + if(tn==null){return 0;} + if(refseqSizeMapC==null){return -1L;} + final long x=refseqSizeMapC.get(tn.id); + return Tools.max(0, x); + } + + /** Number of sequences associated with this node in RefSeq */ + public int toSeqs(TaxNode tn){ + if(tn==null){return 0;} + if(refseqSeqMap==null){return -1;} + final int x=refseqSeqMap.get(tn.id); + return Tools.max(0, x); + } + + /** Number of sequences associated with this node and descandants in RefSeq */ + public long toSeqsC(TaxNode tn){ + if(tn==null){return 0;} + if(refseqSeqMapC==null){return -1L;} + final long x=refseqSeqMapC.get(tn.id); + return Tools.max(0, x); + } + + /** Number of descendants of this node */ + public int toNodes(TaxNode tn){ + if(tn==null){return 0;} + if(nodeMapC==null){return -1;} + final int x=nodeMapC.get(tn.id); + return Tools.max(0, x); + } + + /** + * Fills refseqSizeMap, refseqSizeMapC, etc. from a file containing the summary. 
+ * @param fname Size file name + */ + public void loadSizeFile(String fname){ + if(fname==null){return;} + assert(refseqSizeMap==null); + refseqSizeMap=new IntLongHashMap(); + refseqSizeMapC=new IntLongHashMap(); + refseqSeqMap=new IntHashMap(); + refseqSeqMapC=new IntLongHashMap(); + nodeMapC=new IntHashMap(); + + final ByteFile bf=ByteFile.makeByteFile(fname, true); + final byte delimiter='\t'; + for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ + if(line.length>0 && line[0]!='#'){ + int a=0, b=0; + + while(b<line.length && line[b]!=delimiter){b++;} + assert(b>a) : "Missing field 0: "+new String(line); + int tid=Parse.parseInt(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!=delimiter){b++;} + assert(b>a) : "Missing field 1: "+new String(line); + long size=Parse.parseLong(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!=delimiter){b++;} + assert(b>a) : "Missing field 2: "+new String(line); + long csize=Parse.parseLong(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!=delimiter){b++;} + assert(b>a) : "Missing field 3: "+new String(line); + int seqs=Parse.parseInt(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!=delimiter){b++;} + assert(b>a) : "Missing field 4: "+new String(line); + long cseqs=Parse.parseLong(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!=delimiter){b++;} + assert(b>a) : "Missing field 5: "+new String(line); + int cnodes=Parse.parseInt(line, a, b); + b++; + a=b; + + if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);} + if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);} + if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);} + if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);} + if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);} + } + } + bf.close(); + } + + /*--------------------------------------------------------------*/ + /*---------------- IMG ----------------*/ + /*--------------------------------------------------------------*/ + + public static int imgToTaxid(long img){ + ImgRecord ir=imgMap.get(img); +// assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n"; + return ir==null ? -1 : ir.taxID; + } + + public TaxNode imgToTaxNode(long img){ + int tid=imgToTaxid(img); + return tid<1 ? 
null : getNode(tid);
+ }
+ 
+// public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){
+// assert(imgMap==null);
+// if(fname==null){return 0;}
+// ImgRecord2.storeName=storeName;
+// if(outstream!=null){System.err.println("Loading IMG.");}
+// Timer t=new Timer(outstream, false);
+// ImgRecord2[] array=ImgRecord2.toArray(fname);
+// int x=loadIMG(array);
+// t.stopAndPrint();
+// return x;
+// }
+ 
+ public static int loadIMG(String fname, boolean storeName, PrintStream outstream){
+ assert(imgMap==null);
+ if(fname==null){return 0;}
+ ImgRecord.storeName=storeName;
+ if(outstream!=null){System.err.println("Loading IMG.");}
+ Timer t=new Timer(outstream, false);
+ ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ);
+ int x=loadIMG(array);
+ t.stopAndPrint();
+ return x;
+ }
+ 
+ public static int loadIMG(ImgRecord[] array){
+ assert(imgMap==null);
+ imgMap=new HashMap<Long, ImgRecord>((int)(array.length*1.5));
+ for(ImgRecord record : array){
+ imgMap.put(record.imgID, record);
+ }
+ return imgMap.size();
+ }
+ 
+ @Deprecated
+ public static int parseLevel(String b){
+ final int level;
+ if(b==null){level=-1;}
+ else if(Tools.isNumeric(b.charAt(0))){
+ level=Integer.parseInt(b);
+ }else{
+ level=stringToLevel(b.toLowerCase());
+ }
+ return level;
+ }
+ 
+ public static int parseLevelExtended(String b){
+ final int level;
+ if(b==null){level=-1;}
+ else if(Tools.isNumeric(b.charAt(0))){
+ level=levelToExtended(Integer.parseInt(b));
+ }else{
+ level=stringToLevelExtended(b.toLowerCase());
+ }
+ return level;
+ }
+ 
+ public boolean isUnclassified(int tid){
+ TaxNode tn=getNode(tid);
+ while(tn!=null && tn.id!=tn.pid){
+ if(tn.isUnclassified()){return true;}
+ if(tn.pid==tn.id){break;}
+ tn=getNode(tn.pid);
+ }
+ return false;
+ }
+ 
+ public boolean isEnvironmentalSample(int tid){
+ TaxNode tn=getNode(tid);
+ while(tn!=null && tn.id!=tn.pid){
+ if(tn.isEnvironmentalSample()){return true;}
+ if(tn.pid==tn.id){break;}
+ tn=getNode(tn.pid);
+ }
+ return false;
+ }
+ 
+ public boolean isVirus(int tid){
+ TaxNode tn=getNode(tid);
+ while(tn!=null && tn.id!=tn.pid){
+ if(tn.id==VIRUSES_ID){return true;}
+ if(tn.pid==tn.id){break;}
+ tn=getNode(tn.pid);
+ }
+ return false;
+ }
+ 
+ public long definedLevels(int tid){
+ long levels=0;
+ TaxNode tn=getNode(tid);
+ while(tn!=null && tn.id!=tn.pid){
+ levels=levels|(1L<<tn.level);
+ tn=getNode(tn.pid);
+ }
+ return levels;
+ }
+ 
+ public long definedLevelsExtended(int tid){
+ long levels=0;
+ TaxNode tn=getNode(tid);
+ while(tn!=null && tn.id!=tn.pid){
+ levels=levels|(1L<<tn.levelExtended);
+ tn=getNode(tn.pid);
+ }
+ return levels;
+ }
+ 
+ /**
+ * Generates the mononomial name for this taxonomic level based on the scientific name.
+ * For example, "Homo sapiens" -> "sapiens"
+ * @param tid TaxID
+ * @return Mononomial name for this node.
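+ * If the scientific name does not start with the parent node's name, the full name is returned unchanged.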
+ */
+ public String mononomial(int tid){return mononomial(getNode(tid));}
+ public String mononomial(TaxNode tn){
+ if(tn==null){return null;}
+ String name=tn.name;
+ if(name.indexOf(' ')<0){return name;}
+ TaxNode parent=getNode(tn.pid);
+ if(parent==null){return name;}
+ String pname=parent.name;
+ if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){
+ name=name.substring(pname.length()+1);
+ }
+ return name;
+ }
+ 
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+ 
+ /** All nodes in the tree in a flat array, indexed by TaxID */
+ public final TaxNode[] nodes;
+ 
+ /** Number of nodes per normal level */
+ public final int[] nodesPerLevel=new int[taxLevelNames.length];
+ 
+ /** Number of nodes per extended level */
+ public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length];
+ 
+ /** Number of nodes in the tree */
+ public final int nodeCount;
+ 
+ /** Maps old TaxIDs to new TaxIDs */
+ public final IntHashMap mergedMap;
+ 
+ /** Arrays of all nodes at a given taxonomic level (extended) */
+ public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][];
+ 
+ /** Map of names to nodes */
+ HashMap<String, ArrayList<TaxNode>> nameMap;
+ /** Map of lowercase names to nodes */
+ HashMap<String, ArrayList<TaxNode>> nameMapLower;
+ /** Map of nodes to child nodes */
+ HashMap<TaxNode, ArrayList<TaxNode>> childMap;
+ public HashMap<String, ArrayList<TaxNode>> nameMap(){return nameMap;}
+ 
+ @Deprecated
+ public int minValidTaxa=0; //TODO: Remove (will break serialization)
+ 
+ /** Infer ranks for no-rank nodes, when possible */
+ public boolean simplify=true;
+ /** See simplify() for details, works in conjunction with simplify */
+ public boolean reassign=true;
+ /** Discard no-rank nodes */
+ public boolean skipNorank=false;
+ public int inferRankLimit=0;//levelMap.get("species");
+ 
+ //Node Statistics
+ /** Number of bases assigned to this TaxID in RefSeq */
+ private IntLongHashMap refseqSizeMap;
+ /** Number of bases assigned to this TaxID and descendants in RefSeq */
+ private IntLongHashMap refseqSizeMapC;
+ /** Number of sequences assigned to this TaxID in RefSeq */
+ private IntHashMap refseqSeqMap;
+ /** Number of sequences assigned to this TaxID and descendants in RefSeq */
+ private IntLongHashMap refseqSeqMapC;
+ /** Number of descendant nodes, inclusive, for each TaxID */
+ private IntHashMap nodeMapC;
+ 
+ /*--------------------------------------------------------------*/
+ /*---------------- Statics ----------------*/
+ /*--------------------------------------------------------------*/
+ 
+ /** Assign levels to unranked nodes below species level, when possible */
+ public static boolean assignStrains=true;
+ /** Assume headers are in Silva format */
+ public static boolean SILVA_MODE=false;
+ /** Assume headers are in Unite format */
+ public static boolean UNITE_MODE=false;
+ /** Probably unnecessary at this point... present for legacy reasons */
+ public static boolean CRASH_IF_NO_GI_TABLE=true;
+ 
+ public static boolean verbose=false;
+ public static boolean SHOW_WARNINGS=false;
+ 
+ /** Maps IMG IDs to records from the dump file */
+ private static HashMap<Long, ImgRecord> imgMap;
+ 
+ /** Set to false if the tree is expected to be mutated.
+ * @TODO Remove mutable fields from the tree (like counters).
+ */ + public static boolean ALLOW_SHARED_TREE=true; + + /** Universal location of the shared TaxTree used by various classes */ + private static TaxTree sharedTree; + + /** A simpler and probably less safe version of sharedTree(...) */ + public static TaxTree getTree(){return sharedTree;} + + /** + * Fetch the shared tree, loading it from file if not present. + * @return A tree. + * @TODO: Check proper-construction of double-checked synchronize + */ + private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) { + if(!ALLOW_SHARED_TREE){return null;} + if(sharedTree==null && fname!=null){ + if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();} + synchronized(TaxTree.class){ + if(sharedTree==null){ + if(outstream!=null){outstream.println("Loading tax tree.");} + Timer t=new Timer(outstream, false); + setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat); + t.stopAndPrint(); + } + } + } + if(hashNames && sharedTree.nameMap==null){ + synchronized(sharedTree){ + if(sharedTree.nameMap==null){ + if(outstream!=null){outstream.println("Hashing names.");} + Timer t=new Timer(outstream, false); + sharedTree.hashNames(hashDotFormat); + t.stopAndPrint(); + } + } + } + return sharedTree; + } + + /** + * For initialization. Normally only one tree is needed by a process so it is set here. + * If the tree is already set nothing will happen, unless additional hashing is needed. + */ + private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){ + assert(ALLOW_SHARED_TREE); + assert(sharedTree==null); + sharedTree=tree; + if(hashNames && sharedTree.nameMap==null){ + synchronized(sharedTree){ + if(sharedTree.nameMap==null){ + sharedTree.hashNames(hashDotFormat); + } + } + } + } + + /** + * Determine whether a taxonomic level is standard. e.g.:<br> + * isSimple("phylum")=true<br> + * isSimple("subphylum")=false<br> + * isSimple("no-rank")=false + * @param levelExtended The extended level to test. + * @return True if this level is not no-rank, and the names of the normal and extended levels match. + */ + public static boolean isSimple(int levelExtended){ + int level=extendedToLevel(levelExtended); + return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)); + } + + /** + * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br> + * isSimple("phylum")=true<br> + * isSimple("substrain")=true<br> + * isSimple("subphylum")=false<br> + * isSimple("no-rank")=false + * @param levelExtended The extended level to test. + * @return True if this level is not no-rank, and the names of the normal and extended levels match. 
+ */ + public static boolean isSimple2(int levelExtended){ + int level=extendedToLevel(levelExtended); + return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level) + || levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E); + } + + /*--------------------------------------------------------------*/ + /*---------------- Constants ----------------*/ + /*--------------------------------------------------------------*/ + + /** Get the number for the normal level of this name */ + public static final int stringToLevel(String s){return altLevelMap.get(s);} + public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);} + /** Get the number for the extended level of this name */ + public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);} + /** Get the normal name for this normal level */ + public static final String levelToString(int x){return taxLevelNames[x];} + /** Get the extended name for this extended level */ + public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];} + /** Get the abbreviated name for this normal level */ + public static final String levelToStringShort(int x){return taxLevelNamesShort[x];} + + /** Normal, aka canonical, aka simple tax level names */ + private static final String[] taxLevelNames=new String[] { + "no rank", "subspecies", "species", "genus", + "family", "order", "class", "phylum", + "kingdom", "superkingdom", "domain", "life" + }; + public static final int numTaxLevelNames=taxLevelNames.length; + + /** + * Definitive representation of all NCBI taxonomic level names. + * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash. + * The first dimension maps normal ranks to extended ranks. + * Both dimensions are ordered ascending. + * @TODO Note! If this goes over 63 names it will cause a problem with getDefinedLevels(). 
+ */ + //TODO See @TODO + private static final String[][] taxLevelNamesExtendedMatrix=new String[][] { + {"no rank"}, + {"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup", + "biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas", + "subspecies"}, + {"species"}, + {"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"}, + {"subtribe", "tribe", "subfamily", "family"}, + {"superfamily", "parvorder", "infraorder", "suborder", "order"}, + {"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"}, + {"superclass", "subdivision", "division", "subphylum", "phylum"}, + {"superphylum", "subkingdom", "kingdom"}, + {"superkingdom"}, + {"domain"}, + {"life"} + }; + + /** Extended tax level names as a 1D array */ + private static final String[] taxLevelNamesExtended=makeNamesExtended(); + /** Number of extended tax levels */ + public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length; + + /** Flatten the extended tax level names matrix to a 1D array */ + private static final String[] makeNamesExtended(){ + ArrayList<String> list=new ArrayList<String>(); + for(String[] s : taxLevelNamesExtendedMatrix){ + for(String ss : s){ + list.add(ss); + } + } + return list.toArray(new String[0]); + } + + /** Abbreviations of tax level names, mainly for semicolon form */ + private static final String[] taxLevelNamesShort=new String[] { + "nr", "ss", "s", "g", + "f", "o", "c", "p", + "k", "sk", "d", "l" + }; + + /** Normal tax level numbers as constants */ + public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3, + FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11; + + /** TaxID of Life node */ + public static final int LIFE_ID=1; + /** TaxID of Cellular Organisms node */ + public static final int CELLULAR_ORGANISMS_ID=131567; + /** TaxID of Bacteria node */ + public static final int BACTERIA_ID=2; //Is this safe? Who knows... 
+ /** TaxID of Archaea node */
+ public static final int ARCHAEA_ID=2157;
+ /** TaxID of Euk node */
+ public static final int EUKARYOTA_ID=2759;
+ /** TaxID of Animal node */
+ public static final int METAZOA_ID=33208, ANIMALIA_ID=33208;
+ /** TaxID of Plant node */
+ public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090;
+ /** TaxID of Fungi node */
+ public static final int FUNGI_ID=4751;
+ /** TaxID of Virus node */
+ public static final int VIRUSES_ID=10239;
+ /** TaxID of Viroids node (now defunct) */
+ public static final int VIROIDS_ID=12884;
+ 
+ /** Maps normal level names to normal level numbers */
+ private static final HashMap<String, Integer> levelMap=makeLevelMap();
+ /** Maps extended level names to extended level numbers */
+ private static final HashMap<String, Integer> levelMapExtended=makeLevelMapExtended();
+ /** Maps extended level names to normal level numbers */
+ private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap();
+ 
+ /** Common extended level numbers as constants */
+ public static final int NO_RANK_E=NO_RANK,
+ SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"),
+ SUBSPECIES_E=stringToLevelExtended("subspecies"),
+ SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"),
+ FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"),
+ CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"),
+ KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"),
+ DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life");
+ 
+ /** Map of normal to extended level numbers */
+ private static final int[] levelToExtended=new int[] {
+ NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E,
+ ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E
+ };
+ 
+ /** Map of extended to normal level numbers */
+ private static final int[] extendedToLevel=makeExtendedToLevel();
+ 
+ /** Creates extendedToLevel from taxLevelNamesExtendedMatrix during initialization. */
+ private static int[] makeExtendedToLevel(){
+ int len=0;
+ for(String[] array : taxLevelNamesExtendedMatrix){
+ len+=array.length;
+ }
+ int[] ret=new int[len];
+ 
+ int pos=0;
+ for(int level=0; level<taxLevelNamesExtendedMatrix.length; level++){
+ String[] array=taxLevelNamesExtendedMatrix[level];
+ for(int i=0; i<array.length; i++){
+ ret[pos]=level;
+ pos++;
+ }
+ }
+ return ret;
+ }
+ 
+ /** Convert a standard level number (like KINGDOM) to extended (like KINGDOM_E). */
+ public static final int levelToExtended(int level){
+ return level<0 ? level : levelToExtended[level];
+ }
+ 
+ /** Convert an extended level number (like PHYLUM_E) to a normal level number (like PHYLUM).
+ * Non-standard levels will be converted to the next higher standard level;
+ * e.g., subphylum -> phylum */
+ public static final int extendedToLevel(int extended){
+ return extended<0 ? -1 : extendedToLevel[extended];
+ }
+ 
+ /* Pre-compiled delimiters to save time when splitting lines */
+ private static final Pattern delimiterTab = Pattern.compile("\t");
+ private static final Pattern delimiter = Pattern.compile("\t\\|\t");
+ private static final Pattern delimiterPipe = Pattern.compile("\\|");
+ private static final Pattern delimiterTilde = Pattern.compile("\\~");
+ private static final Pattern delimiter2 = Pattern.compile("[\\s_]+");
+ 
+ public static boolean IMG_HQ=false;
+ 
+ /* For these fields, see the corresponding functions, below.
+ * They define the default paths to various data on NERSC. */ + + private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest"; + private static final String defaultTaxPathAws="/test1/tax/latest"; + private static final String defaultTaxPathIGBVM="/data/tax/latest"; + private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; + private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; + private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; + private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz"; + private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz"; + private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz"; + + private static final String defaultImgFile="TAX_PATH/imgDump.txt"; + private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz"; + private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz"; + private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz"; + private static final String defaultPatternFile="TAX_PATH/patterns.txt"; + private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz"; + + private static final String defaultAccessionFile= + //"TAX_PATH/shrunk.protF.accession2taxid.gz," + + "TAX_PATH/shrunk.prot.accession2taxid.gz," + + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz," + + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz," + + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz," +// + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz," + + "TAX_PATH/shrunk.dead_wgs.accession2taxid.gz," +// + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz," + + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz," + + "TAX_PATH/shrunk.pdb.accession2taxid.gz"; + + /** For setting TAX_PATH, the root to taxonomy files */ + public static final String defaultTaxPath(){ + return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc; + } + + /** 16S consensus sequences per TaxID */ + public static final String default16SFile(){ + return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc; + } + + /** 18S consensus sequences per TaxID */ + public static final String default18SFile(){ + return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc; + } + + /** Path to all taxonomy files, substituted in to make specific file paths */ + public static String TAX_PATH=defaultTaxPath(); + + /** Location of gitable.int2d.gz for gi lookups */ + public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);} + /** Location of tree.taxtree.gz */ + public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);} + + //Use the prot.FULL.gz ncbi file. + public static boolean protFull=false; + + /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */ + public static final String defaultAccessionFile(){ + String s=(protFull ? 
"TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile; + return s.replaceAll("TAX_PATH", TAX_PATH); + } + /** Location of patterns.txt, which holds information about observed accession string formats */ + public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);} + /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */ + public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);} + /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */ + public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);} + + /** Screen output gets printed here */ + private static PrintStream outstream=System.out; + +}