Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/TaxTree.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package tax; import java.io.PrintStream; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.regex.Pattern; import fileIO.ByteFile; import fileIO.ReadWrite; import fileIO.TextFile; import shared.Parse; import shared.Parser; import shared.PreParser; import shared.Shared; import shared.Timer; import shared.Tools; import structures.ByteBuilder; import structures.IntHashMap; import structures.IntList; import structures.IntLongHashMap; /** * Represents a taxonomic tree. * Usually just one of these needs to be created for a process. * Designed for NCBI's taxdmp.zip file contents. * @author Brian Bushnell * @date Mar 6, 2015 * */ public class TaxTree implements Serializable{ /** * */ private static final long serialVersionUID = 5894416521711540017L; /*--------------------------------------------------------------*/ /*---------------- Main ----------------*/ /*--------------------------------------------------------------*/ /** * Code entrance from the command line. * This is not called normally, only when converting NCBI text files * into a binary representation and writing it to disk. * @param args Command line arguments */ public static void main(String[] args){ {//Preparse block for help, config files, and outstream PreParser pp=new PreParser(args, outstream, null, false); args=pp.args; outstream=pp.outstream; } assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n"; ReadWrite.USE_UNPIGZ=true; ReadWrite.USE_PIGZ=true; ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9); ReadWrite.PIGZ_BLOCKSIZE=256; ReadWrite.PIGZ_ITERATIONS=60; Timer t=new Timer(); TaxTree tree=new TaxTree(args); t.stop(); outstream.println("Retained "+tree.nodeCount+" nodes:"); for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){ outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]); if(verbose){ int lim=10; for(int j=0; j<lim && j<tree.treeLevelsExtended[i].length; j++){ TaxNode n=tree.treeLevelsExtended[i][j]; outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); } for(int j=tree.treeLevelsExtended[i].length-lim; j<tree.treeLevelsExtended[i].length; j++){ if(j>=lim){ TaxNode n=tree.treeLevelsExtended[i][j]; outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); } } } outstream.println(); } outstream.println(); outstream.println("Time: \t"+t); if(args.length>2){//Write a tree ReadWrite.write(tree, args[3], true); } } /** Parse arguments from the command line */ private Parser parse(String[] args){ //Create a parser object Parser parser=new Parser(); //Set any necessary Parser defaults here //parser.foo=bar; //Parse each argument for(int i=0; i<args.length; i++){ String arg=args[i]; //Break arguments into their constituent parts, in the form of "a=b" String[] split=arg.split("="); String a=split[0].toLowerCase(); String b=split.length>1 ? split[1] : null; if(b!=null && b.equalsIgnoreCase("null")){b=null;} if(a.equals("verbose")){ verbose=Parse.parseBoolean(b); }else if(a.equals("parse_flag_goes_here")){ long fake_variable=Parse.parseKMG(b); //Set a variable here }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser //do nothing }else if(i>3){ outstream.println("Unknown parameter "+args[i]); assert(false) : "Unknown parameter "+args[i]; } } return parser; } /*--------------------------------------------------------------*/ /*---------------- Initialization ----------------*/ /*--------------------------------------------------------------*/ /*--------------------------------------------------------------*/ /*---------------- Constructors ----------------*/ /*--------------------------------------------------------------*/ /** * Constructor using filenames from command line arguments, in the format of: * {names, nodes, merged} * @param args Command line arguments */ private TaxTree(String[] args){ this(args[0], args[1], args[2], args); } /** * @param namesFile NCBI names.txt * @param nodesFile NCBI nodes.txt * @param mergedFile NCBI merged.txt * @param args */ private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){ if(args!=null) { Parser parser=parse(args); } nodes=getNames(namesFile); getNodes(nodesFile, nodes); mergedMap=getMerged(mergedFile); countChildren(); outstream.println("Counted children."); int rounds=percolate(); outstream.println("Percolated "+rounds+" rounds to fixpoint."); if(assignStrains){ assignStrains(); rounds=percolate(); outstream.println("Percolated "+rounds+" rounds to fixpoint."); } if(simplify){ if(verbose){outstream.println("Simplifying.");} int removed=simplify(nodes); if(verbose){outstream.println("Removed "+removed+" nodes.");} rounds=percolate(); outstream.println("Percolated "+rounds+" rounds to fixpoint."); } int errors=test(nodes); // assert(errors==0); //Not possible since the tree is wrong. if(errors>0) { System.err.println("Found "+errors+" errors in tree."); } for(TaxNode n : nodes){ if(n!=null){ nodesPerLevel[n.level]++; nodesPerLevelExtended[n.levelExtended]++; } } // for(int i=0; i<nodesPerLevel.length; i++){ // treeLevels[i]=new TaxNode[nodesPerLevel[i]]; // } for(int i=0; i<nodesPerLevelExtended.length; i++){ treeLevelsExtended[i]=new TaxNode[nodesPerLevelExtended[i]]; } // { // int[] temp=new int[nodesPerLevel.length]; // for(TaxNode n : nodes){ // if(n!=null){ // int level=n.level; // treeLevels[level][temp[level]]=n; // temp[level]++; // } // } // } { int[] temp=new int[nodesPerLevelExtended.length]; for(TaxNode n : nodes){ if(n!=null){ int level=n.levelExtended; treeLevelsExtended[level][temp[level]]=n; temp[level]++; } } } nodeCount=(int)Tools.sum(nodesPerLevelExtended); } /*--------------------------------------------------------------*/ /*--------------- Loaders ----------------*/ /*--------------------------------------------------------------*/ /** * Load a tax tree from disk. * @param taxTreeFile Serialized TaxTree. * @param hashNames Hash nodes using names as keys * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens * @return */ public static final TaxTree loadTaxTree(String taxTreeFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){ if(taxTreeFile==null){return null;} return loadTaxTree(taxTreeFile, null, null, null, outstream, hashNames, hashDotFormat); } /** * Load a tax tree from disk, either from a binary tree file, * or from NCBI text files. * @param taxTreeFile Binary representation; mutually exclusive with other files. * @param taxNameFile NCBI names.txt * @param taxNodeFile NCBI nodes.txt * @param taxMergedFile NCBI merged.txt * @param hashNames Hash nodes using names as keys * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens * @return The loaded tree */ public static final TaxTree loadTaxTree(String taxTreeFile, String taxNameFile, String taxNodeFile, String taxMergedFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){ if(taxTreeFile!=null || taxNodeFile==null){ TaxTree tree=sharedTree(taxTreeFile, hashNames, hashDotFormat, outstream); if(tree!=null){return tree;} } if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=defaultTreeFile();} assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files."; Timer t=new Timer(); if(outstream!=null){outstream.print("\nLoading tax tree; ");} final TaxTree tree; if(taxTreeFile!=null){ tree=ReadWrite.read(TaxTree.class, taxTreeFile, true); }else{ tree=new TaxTree(taxNameFile, taxNodeFile, taxMergedFile, null); } t.stop(); if(hashNames){ if(outstream!=null){outstream.println("Hashing names.");} tree.hashNames(hashDotFormat); } if(outstream!=null){ outstream.println("Time: \t"+t); Shared.printMemory(); outstream.println(); } if(ALLOW_SHARED_TREE){sharedTree=tree;} return tree; } /*--------------------------------------------------------------*/ /*--------------- Constructor Helpers ----------------*/ /*--------------------------------------------------------------*/ /** * Finds unranked nodes in the archaeal and bacterial kingdoms. * If these are below species level, have a ranked parent, * and have no ranked children, they are assigned strain or substrain. */ private void assignStrains(){ outstream.println("Assigning strains."); int strains=0, substrains=0; TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed yet TaxNode archaea=getNode(ARCHAEA_ID); assert(bacteria.name.equalsIgnoreCase("Bacteria")); assert(archaea.name.equalsIgnoreCase("Archaea")); ArrayList<TaxNode> bactList=new ArrayList<TaxNode>(); ArrayList<TaxNode> archList=new ArrayList<TaxNode>(); for(TaxNode tn : nodes){ if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){ if(descendsFrom(tn, bacteria)){ bactList.add(tn); }else if(descendsFrom(tn, archaea)){ archList.add(tn); } } } ArrayList<TaxNode> prokList=new ArrayList<TaxNode>(bactList.size()+archList.size()); prokList.addAll(bactList); prokList.addAll(archList); for(TaxNode tn : prokList){ if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ TaxNode parent=nodes[tn.pid]; if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ tn.levelExtended=STRAIN_E; tn.level=SUBSPECIES; tn.setOriginalLevel(STRAIN_E); strains++; } } } // outstream.println("Assigned "+strains+" strains."); for(TaxNode tn : prokList){ if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ TaxNode parent=nodes[tn.pid]; if(parent.levelExtended==STRAIN_E){ tn.levelExtended=SUBSTRAIN_E; tn.level=SUBSPECIES; tn.setOriginalLevel(SUBSTRAIN_E); substrains++; } } } // outstream.println("Assigned "+substrains+" substrains."); } @Deprecated private void assignStrainsOld(){ outstream.println("Assigning strains."); int strains=0, substrains=0; TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed assert(bacteria.name.equalsIgnoreCase("Bacteria")); for(TaxNode tn : nodes){ if(tn!=null && tn.originalLevel()==NO_RANK){ TaxNode parent=nodes[tn.pid]; if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){ // nodesPerLevelExtended[STRAIN_E]++; // nodesPerLevelExtended[tn.levelExtended]--; tn.levelExtended=STRAIN_E; tn.level=SUBSPECIES; tn.setOriginalLevel(STRAIN_E); strains++; } } } // outstream.println("Assigned "+strains+" strains."); for(TaxNode tn : nodes){ if(tn!=null && tn.originalLevel()==NO_RANK){ TaxNode parent=nodes[tn.pid]; if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){ // nodesPerLevelExtended[SUBSTRAIN_E]++; // nodesPerLevelExtended[tn.levelExtended]--; tn.levelExtended=SUBSTRAIN_E; tn.level=SUBSPECIES; tn.setOriginalLevel(SUBSTRAIN_E); substrains++; } } } // outstream.println("Assigned "+substrains+" substrains."); } /*--------------------------------------------------------------*/ /*---------------- Construction ----------------*/ /*--------------------------------------------------------------*/ /** * Create tax nodes using names in the designated file. * @param fname NCBI names.txt * @return Array of created nodes, where array[x] contains the node with TaxID x. */ private static TaxNode[] getNames(String fname){ ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000); int max=0; TextFile tf=new TextFile(fname, false); for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ if(s.contains("scientific name")){ String[] split=delimiter.split(s, 3); assert(split.length==3) : s; int id=Integer.parseInt(split[0]); String name=split[1]; if(id==1 && name.equalsIgnoreCase("root")){name="Life";} max=Tools.max(max, id); list.add(new TaxNode(id, name)); } } TaxNode[] nodes=new TaxNode[max+1]; for(TaxNode n : list){ assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n; nodes[n.id]=n; } return nodes; } /** * Parses names file a second time to fill in additional information. * Should really be merged into getNames. * @TODO Merge into getNames */ private static TaxNode[] getNodes(String fname, TaxNode[] nodes){ int max=0; LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>(); TextFile tf=new TextFile(fname, false); for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ String[] split=delimiter.split(s, 4); assert(split.length==4) : s; int id=-1, pid=-1, level=-1, levelExtended=-1; id=Integer.parseInt(split[0]); try { pid=Integer.parseInt(split[1]); } catch (NumberFormatException e) { // TODO Auto-generated catch block e.printStackTrace(); System.err.println("Bad line: "+s+"\n"+Arrays.toString(split)); } boolean alt=false; { String key=split[2]; Integer obj0=levelMap.get(key); Integer obj=levelMapExtended.get(key); assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split); if(obj0==null){ obj0=altLevelMap.get(key); alt=true; } if(obj0!=null){ level=obj0; levelExtended=obj; if(id==pid){ level=LIFE; levelExtended=LIFE_E; alt=false; } }else{ if(id==pid){ level=LIFE; levelExtended=LIFE_E; alt=false; }else{ int[] count=oddNames.get(key); if(count==null){ count=new int[1]; oddNames.put(key, count); } count[0]++; } } } max=Tools.max(max, id); TaxNode n=nodes[id]; assert(n!=null && n.pid<0) : n+" -> "+s; n.pid=pid; n.level=level; n.levelExtended=levelExtended; n.setOriginalLevel(levelExtended); n.setCanonical(!alt); assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n"; } if(oddNames.size()>0){ outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:"); if(verbose){ for(String s : oddNames.keySet()){ outstream.println(oddNames.get(s)[0]+"\t"+s); } } } return nodes; } /** * Count child nodes of each node. * This can be used to size arrays or determine which nodes are leaves. */ private void countChildren(){ for(TaxNode child : nodes){ if(child!=null && child.pid!=child.id){ TaxNode parent=getNode(child.pid); if(parent!=child){parent.numChildren++;} } } } //TODO - This could be finished in 2 passes using the childTable. /** * Fill derived fields minParentLevelExtended and maxChildLevelExtended * by percolating information through the tree until a fixpoint is reached. * @TODO This could be finished in 2 passes using the childTable. * @return Number of rounds required to reach fixpoint. */ private int percolate(){ boolean changed=true; int rounds=0; while(changed){ changed=false; rounds++; for(TaxNode child : nodes){ if(child!=null && child.pid!=child.id){ TaxNode parent=getNode(child.pid); changed=(child.discussWithParent(parent) | changed); } } if(!changed){break;} changed=false; rounds++; for(int i=nodes.length-1; i>=0; i--){ TaxNode child=nodes[i]; if(child!=null && child.pid!=child.id){ TaxNode parent=getNode(child.pid); changed=(child.discussWithParent(parent) | changed); } } } return rounds; } /** * Load nodes into the nameMap and nameMapLower, mapped to their names. * @param genusDotSpecies Also hash abbreviations such as E.coli. */ public synchronized void hashNames(boolean genusDotSpecies){ if(nameMap!=null){return;} assert(nameMap==null); assert(nameMapLower==null); final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN)); nameMap=new HashMap<String, ArrayList<TaxNode>>(size); nameMapLower=new HashMap<String, ArrayList<TaxNode>>(size); //Hash the names, both lowercase and uppercase for(TaxNode n : nodes){ if(n!=null){ String name=n.name; if(name.indexOf('_')>=0){ name=name.replace('_', ' ').trim(); } if(name!=null && !name.equals("environmental samples")){ { ArrayList<TaxNode> list=nameMap.get(name); if(list==null){ list=new ArrayList<TaxNode>(); nameMap.put(name, list); } list.add(n); } { String lc=name.toLowerCase(); ArrayList<TaxNode> list=nameMapLower.get(lc); if(list==null){ list=new ArrayList<TaxNode>(); nameMapLower.put(lc, list); } list.add(n); } } } } //Hash G.species versions of the names, both lowercase and uppercase if(genusDotSpecies){ ByteBuilder bb=new ByteBuilder(64); for(TaxNode n : nodes){ if(n!=null && n.levelExtended==SPECIES_E){ String name=n.name; if(name.indexOf('_')>=0){ name=name.replace('_', ' ').trim(); } if(name!=null && !name.equals("environmental samples")){ final String dotFormat=dotFormat(name, bb); if(dotFormat!=null){ { ArrayList<TaxNode> list=nameMap.get(dotFormat); if(list==null){ list=new ArrayList<TaxNode>(); nameMap.put(dotFormat, list); } list.add(n); } { String lc=dotFormat.toLowerCase(); ArrayList<TaxNode> list=nameMapLower.get(lc); if(list==null){ list=new ArrayList<TaxNode>(); nameMapLower.put(lc, list); } list.add(n); } } } } } } } /** * Generate the "dot format" name of a node. * For example, transform "Homo sapiens" to "H.sapiens" * @param name Node name * @param buffer A ByteBuilder that may be modified * @return Dot format */ private static String dotFormat(String name, ByteBuilder buffer){ if(name==null || name.indexOf('.')>=0){return null;} final int firstSpace=name.indexOf(' '); if(firstSpace<0 || firstSpace>=name.length()-1){return null;} final int lastSpace=name.lastIndexOf(' '); if(firstSpace!=lastSpace){return null;} final String a=name.substring(0, firstSpace); final String b=name.substring(lastSpace+1); final char ca=a.charAt(0); final char cb=b.charAt(0); if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;} if(buffer==null){buffer=new ByteBuilder(2+b.length());} else{buffer.clear();} buffer.append(ca).append('.').append(b); return buffer.toString(); } /** * Fill childMap, which maps nodes to their children. */ public synchronized void hashChildren(){ assert(childMap==null); int nodesWithChildren=0; for(TaxNode tn : nodes){ if(tn!=null && tn.numChildren>0){nodesWithChildren++;} } childMap=new HashMap<TaxNode, ArrayList<TaxNode>>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN)); for(TaxNode tn : nodes){ if(tn!=null){ if(tn.numChildren>0){ childMap.put(tn, new ArrayList<TaxNode>(tn.numChildren)); } } } for(TaxNode tn : nodes){ if(tn!=null){ if(tn.id!=tn.pid){ ArrayList<TaxNode> list=childMap.get(getNode(tn.pid)); if(list!=null){list.add(tn);} } } } } /** * Fetch this node's children. * @param parent Node in question * @return List of child nodes */ public ArrayList<TaxNode> getChildren(TaxNode parent){ if(parent.numChildren<1){return null;} if(childMap!=null){return childMap.get(parent);} ArrayList<TaxNode> list=new ArrayList<TaxNode>(parent.numChildren); for(TaxNode tn : nodes){ if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){ list.add(tn); } } return list; } /** * Load a map of old to new TaxIDs. * @param mergedFile NCBI merged.txt. * @return Map of old to new TaxIDs */ private static IntHashMap getMerged(String mergedFile) { if(mergedFile==null){return null;} String[] lines=TextFile.toStringLines(mergedFile); if(lines.length<1){return null;} IntHashMap map=new IntHashMap((int)(lines.length*1.3)); for(String line : lines){ String[] split=delimiterTab.split(line); int a=Integer.parseInt(split[0]); int b=Integer.parseInt(split[2]); map.put(a, b); } return map; } /** * Simplify the tree by assigning ranks to unranked nodes, * where possible, through inference. * Optionally removes unranked nodes based on the skipNorank field. * @param nodes Array of all TaxNodes. * @return Number of nodes removed. */ private int simplify(TaxNode nodes[]){ int failed=test(nodes); int removed=0; int reassigned=0; if(reassign){ boolean changed=true; int changedCount=0; while(changed){ changed=false; for(int i=0; i<nodes.length; i++){ TaxNode n=nodes[i]; if(n!=null && n.levelExtended<1){ int pid=n.pid; TaxNode parent=nodes[pid]; assert(parent!=null) : n; if(n.maxDescendantLevelIncludingSelf()<SUBSPECIES_E && parent.minAncestorLevelIncludingSelf()<=SPECIES_E && parent.minAncestorLevelIncludingSelf()>=SUBSPECIES_E){ // if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ changed=true; n.levelExtended=SUBSPECIES_E; n.level=SUBSPECIES; changedCount++; } } } } System.err.println("Assigned levels to "+changedCount+" unranked nodes."); } if(skipNorank){//Skip nodes with unknown taxa if(verbose){outstream.println("A0");} for(int i=0; i<nodes.length; i++){ TaxNode n=nodes[i]; if(n!=null){ int pid=n.pid; TaxNode parent=nodes[pid]; assert(parent!=null) : n; assert(parent!=n || pid==1) : n+", "+pid; while(parent.levelExtended<1 && n.levelExtended>parent.levelExtended){ //System.err.println("Reassigned from "+parent); assert(parent.id!=parent.pid); parent=nodes[parent.pid]; n.pid=parent.id; reassigned++; } } } for(int i=0; i<nodes.length; i++){ if(nodes[i]!=null && nodes[i].levelExtended<0){ System.err.println("Removed "+nodes[i]); nodes[i]=null; removed++; } } if(verbose){outstream.println("Skipped "+reassigned+" unranked parents, removed "+removed+" invalid nodes.");} } if(inferRankLimit>0){//Infer level for unset nodes (from "no rank") if(verbose){outstream.println("A");} int changed=1; while(changed>0){ changed=0; for(final TaxNode n : nodes){ if(n!=null){ if(n.levelExtended==0){ TaxNode parent=nodes[n.pid]; if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){ n.levelExtended=Tools.max(1, parent.levelExtended-1); assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit); changed++; } } } } if(verbose){outstream.println("changed: "+changed);} } // outstream.println("B"); // for(TaxNode n : nodes){ // if(n!=null && n.level==0){ // n.level=-1; // } // } } failed=test(nodes); // if(reassign){//Skip nodes with duplicate taxa // if(verbose){outstream.println("D");} // int changed=1; // while(changed>0){ // changed=0; // for(final TaxNode n : nodes){ // if(n!=null){ // TaxNode parent=nodes[n.pid]; // TaxNode grandparent=nodes[parent.pid]; // assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent; // assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent; // // while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) || // n.level>parent.level || (n.level==parent.level))){ // parent=grandparent; // grandparent=nodes[parent.pid]; // n.pid=parent.id; // reassigned++; // changed++; // } // } // } // if(verbose){outstream.println("changed: "+changed);} // } // if(verbose){outstream.println("E");} // for(int i=0; i<nodes.length; i++){ // if(nodes[i]!=null && nodes[i].level<0){ // nodes[i]=null; // removed++; // } // } // } failed=test(nodes); if(verbose){outstream.println("F");} {//Ensure the tree is now clean for(int i=0; i<nodes.length; i++){ TaxNode n=nodes[i]; if(n!=null){ TaxNode parent=nodes[n.pid]; TaxNode grandparent=nodes[parent.pid]; assert(n==parent || n.levelExtended<=parent.levelExtended || !n.canonical() || n.levelExtended<1 || parent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; } } } // if(verbose){System.err.println("Reassignments: "+reassigned);} return removed; } /*--------------------------------------------------------------*/ /*---------------- Validation ----------------*/ /*--------------------------------------------------------------*/ /** * Ensure tree has monotonically increasing (or nondescending) ranks. * @param nodes All TaxNodes. * @return Number of violations. */ private static int test(TaxNode[] nodes){ int failed=0; for(final TaxNode n : nodes){ if(n!=null){ TaxNode parent=nodes[n.pid]; try { assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) : "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n" + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended; assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent; // assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical() || !parent.canonical()) : n+" -> "+parent; if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){ if(verbose){outstream.println("Error: "+n+" -> "+parent);} failed++; }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){ // if(verbose){outstream.println("Error: "+n+" -> "+parent);} // failed++; } assert(n!=parent || n.id<=1) : n; } catch (Throwable e) { // TODO Auto-generated catch block e.printStackTrace(); failed++; } } } if(verbose || failed>0){outstream.println(failed+" nodes failed.");} return failed; } /*--------------------------------------------------------------*/ /*---------------- Print Methods ----------------*/ /*--------------------------------------------------------------*/ /** * Format full name in semicolon format, e.g. * "SK:Bacteria;P:Protobacteria;..." * @param tn0 Base node * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. * @return Resultant String */ public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){ StringBuilder sb=new StringBuilder(); if(tn0==null){return "Not found";} String semi=""; ArrayList<TaxNode> list=toAncestors(tn0, skipNonCanonical); boolean addTaxLevel=true; for(int i=list.size()-1; i>=0; i--){ sb.append(semi); TaxNode tn=list.get(i); if(tn.id!=LIFE_ID || list.size()==1){ if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){ sb.append(tn.levelToStringShort()).append(':'); } sb.append(mononomial ? mononomial(tn) : tn.name); semi=";"; } } return sb.toString(); } /** * Return a list of TaxIDs of all ancestors. * @param tn0 Base node * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. * @return List of TaxIDs. */ public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){ if(tn0==null){return null;} IntList list=new IntList(8); TaxNode tn=tn0; while(tn!=null){ if(!skipNonCanonical || tn.isSimple()){ if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);} } if(tn.pid==tn.id){break;} tn=getNode(tn.pid); } if(list.isEmpty()){list.add(tn0.id);} return list; } /** * Return a list of all ancestors. * @param tn0 Base node * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. * @return List of ancestor nodes. */ public ArrayList<TaxNode> toAncestors(final TaxNode tn0, boolean skipNonCanonical){ if(tn0==null){return null;} ArrayList<TaxNode> list=new ArrayList<TaxNode>(8); TaxNode tn=tn0; while(tn!=null){ if(!skipNonCanonical || tn.isSimple()){ if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);} } if(tn.pid==tn.id){break;} tn=getNode(tn.pid); } if(list.isEmpty()){list.add(tn0);} return list; } /** * Generate a path to the genome of an organism on the filesystem; * used by ExplodeTree. Intended for internal JGI use. * @param root Location of the exploded tree. * @return Path to a genome. */ public String toDir(TaxNode node, String root){ StringBuilder sb=new StringBuilder(); if(root==null){root="";} sb.append(root); if(root.length()>0 && !root.endsWith("/")){sb.append('/');} IntList list=toAncestorIds(node, false); list.reverse(); assert(list.get(0)==1) : list + "," +getNode(list.get(0)); for(int i=0; i<list.size(); i++){ sb.append(list.get(i)); sb.append('/'); } return sb.toString(); } /*--------------------------------------------------------------*/ /*---------------- Outer Methods ----------------*/ /*--------------------------------------------------------------*/ /** * Use various techniques to get a TaxID from an unknown String, such as parsing, * name lookups, and accession translation. * @param s String to process. * @return Decoded TaxID. */ public static int getID(String s){return GiToTaxid.getID(s);} /** * Use various techniques to get a TaxID from an unknown byte[], such as parsing, * name lookups, and accession translation. * @param s String to process. * @return Decoded TaxID. */ public static int getID(byte[] s){return GiToTaxid.getID(s);} /** Return the lowest ancestor of the named node with taxonomic level at least minLevel */ public TaxNode getNode(String s, int minLevelExtended){ TaxNode tn=parseNodeFromHeader(s, true); while(tn!=null && tn.levelExtended<minLevelExtended && tn.pid!=tn.id){ tn=getNode(tn.pid); } return tn; } /** * Determine whether a node is a descendant of another. * @param child Possible child TaxID. * @param parent Possible parent TaxID. * @return true iff child descends from parent. */ public boolean descendsFrom(final int child, final int parent){ TaxNode cn=getNode(child), pn=getNode(parent); assert(cn!=null) : "Invalid taxID: "+child; assert(pn!=null) : "Invalid taxID: "+parent; return descendsFrom(cn, pn); } /** * Determine whether a node is a descendant of another. * @param child Possible child node. * @param parent Possible parent node. * @return true iff child descends from parent. */ public boolean descendsFrom(TaxNode child, TaxNode parent){ assert(child!=null && parent!=null) : "Null parameters."; if(child==null || parent==null){return false;} while(child!=parent && child.levelExtended<=parent.levelExtended && child.id!=child.pid){ child=getNode(child.pid); } return child==parent; } /** * Determine whether an organism is classified as X. * @param taxID taxID of organism. * @param ancestorID taxID of possible ancestor. * @return true if the organism is an X. */ public boolean descendsFrom2(int taxID, final int ancestorID) { TaxNode tn=getNode(taxID); while(tn.id!=tn.pid){ if(tn.id==ancestorID){return true;} tn=getNode(tn.pid); } return false; } /** Determine whether an organism is classified as a plant. */ public boolean isPlant(int taxID) {return descendsFrom2(taxID, VIRIDIPLANTAE_ID);} /** Determine whether an organism is classified as an animal. */ public boolean isAnimal(int taxID) {return descendsFrom2(taxID, METAZOA_ID);} /** Determine whether an organism is classified as a fungus. */ public boolean isFungus(int taxID) {return descendsFrom2(taxID, FUNGI_ID);} /** Determine whether an organism is classified as a eukaryote. */ public boolean isEukaryote(int taxID) {return descendsFrom2(taxID, EUKARYOTA_ID);} /** Determine whether an organism is classified as a prokaryote. */ public boolean isProkaryote(int taxID) { TaxNode tn=getNode(taxID); if(tn==null){ System.err.println("*** Warning: Can't find node "+taxID+" ***"); return false; } while(tn.id!=tn.pid){ if(tn.id==BACTERIA_ID || tn.id==ARCHAEA_ID){return true;} tn=getNode(tn.pid); } return false; } /** * Calculate the common ancestor of two nodes. * @param a TaxID of a node. * @param b TaxID of a node. * @return Common ancestor ID of a and b. */ public int commonAncestor(final int a, final int b){ TaxNode an=getNode(a), bn=getNode(b); assert(an!=null) : "Invalid taxID: "+a; assert(bn!=null) : "Invalid taxID: "+b; TaxNode cn=commonAncestor(an, bn); assert(cn!=null) : "No common ancestor: "+an+", "+bn; if(cn==null){return -1;} return cn.id; } /** * Calculate the common ancestor of two nodes. * @param a A node. * @param b A node. * @return Common ancestor of a and b. */ public TaxNode commonAncestor(TaxNode a, TaxNode b){ assert(a!=null && b!=null) : "Null parameters."; if(a==null){return b;} if(b==null){return a;} while(a!=b){ if(a.levelExtended<b.levelExtended){ a=getNode(a.pid); }else{ b=getNode(b.pid); } } return a; } /** * Identify the highest ancestor of a node; * this will presumably be "Life". * @param a Node * @return Highest ancestor */ public TaxNode highestAncestor(TaxNode a){ assert(a!=null); while(a.id!=a.pid){a=getNode(a.pid);} return a; } /*--------------------------------------------------------------*/ /*---------------- Header Parsing ----------------*/ /*--------------------------------------------------------------*/ /** * Determine the TaxID of a String, * without a loaded TaxTree. * This only works if the literal TaxID is embedded in the String. * @param header Typically a sequence header * @return Decoded TaxID, or -1 if unsuccessful */ public static int parseHeaderStatic(String header){ if(header.length()<3){return -1;} if(header.charAt(0)=='>'){header=header.substring(1);} if(!header.startsWith("tid|")){return -1;} int idx=3; int idx2=header.indexOf('|', 4); if(idx2<5){return -1;} int id=-1; try { id=Parse.parseInt(header, idx+1, idx2); // System.err.println("d"+", "+header.substring(idx+1, idx2)); } catch (Throwable e) { // System.err.println("e"+", "+header.substring(idx+1, idx2)); //ignore } return id; } /** * Determine the TaxID of a String. * @param header Typically a sequence header * @param bestEffort In some cases, try certain substrings if the name is not found. * @return */ public TaxNode parseNodeFromHeader(String header, boolean bestEffort){ if(header==null || header.length()<2){return null;} if(header.charAt(0)=='>'){header=header.substring(1);} TaxNode tn; if(SILVA_MODE){ tn=getNodeSilva(header, bestEffort); }else if(UNITE_MODE){ tn=getNodeUnite(header, bestEffort); }else{ final char delimiter=ncbiHeaderDelimiter(header); if(delimiter==' '){ tn=getNodeNewStyle(header); }else{ tn=getNodeOldStyle(header, delimiter); if(tn==null && delimiter=='|'){ // System.err.println("A: "+header); int id=-1; String[] split=header.split("\\|"); if(AccessionToTaxid.LOADED()){ for(int i=0; i<split.length && id<0; i++){//Try accessions first if(AccessionToTaxid.isValidAccession(split[i])){ id=AccessionToTaxid.get(split[i]); } } } for(int i=0; i<split.length && id<0; i++){//Then names id=parseNameToTaxid(split[i]); } // System.err.println("E: "+id); if(id>=0){tn=getNode(id);} // System.err.println("F: "+tn); } } } return tn; } /** * Guess the delimiter character in a String; * typically assumed to be '|', '~', or ' '. */ public static char ncbiHeaderDelimiter(String header){ for(int i=0; i<header.length(); i++){ final char c=header.charAt(i); if(c=='|' || c=='~'){ assert(i>0) : "i="+i+"; malformatted header '"+header+"'"; return c; }else if(Character.isWhitespace(c)){ return ' '; } } return ' '; } /** * Parse a Silva header to a Node. * @param s Silva header. * @param bestEffort Try certain substrings if the name is not found. * @return Node */ TaxNode getNodeSilva(String s, boolean bestEffort){ if(s==null){return null;} if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ return getNodeOldStyle(s, s.charAt(3)); } String[] split=Tools.semiPattern.split(s); int number=-1; // final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast")); // if(chloroplast){return null;} for(int i=split.length-1; number<0 && i>=0; i--){ String last=split[i]; int paren=last.indexOf('('); if(paren>=0){last=last.substring(0, paren);} last=last.trim(); if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){ number=parseNameToTaxid(last); } if(number>=0){return getNode(number);} else if(!bestEffort){break;} } return null; } /** * Parse a Unite header to a Node. * @param s Unite header. * @param bestEffort Try certain substrings if the name is not found. * @return Node */ TaxNode getNodeUnite(String s, boolean bestEffort){ if(s==null){return null;} if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ return getNodeOldStyle(s, s.charAt(3)); } String[] split=Tools.pipePattern.split(s); int number=-1; String name=split[0]; String acc=split[1]; if(AccessionToTaxid.LOADED() && acc.length()>0){ number=AccessionToTaxid.get(acc); } if(number<1){ TaxNode tn=getNodeByName(name); if(tn!=null){number=tn.id;} } if(number>=0){return getNode(number);} return null; } /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */ private TaxNode getNodeOldStyle(final String s, char delimiter){ { int index=s.indexOf(delimiter); if(index<0){ delimiter='~'; index=s.indexOf(delimiter); if(index<0){ delimiter='_'; index=s.indexOf(delimiter); } } int number=-1; Throwable e=null; if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){ // System.err.println("Parsing gi number."); if(GiToTaxid.isInitialized()){ try { number=GiToTaxid.parseGiToTaxid(s, delimiter); } catch (Throwable e2) { e=e2; } }else{ assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s; } // if(number!=-1){System.err.println("number="+number);} }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){ // System.err.println("Parsing ncbi number."); number=GiToTaxid.parseTaxidNumber(s, delimiter); }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){ // System.err.println("Parsing ncbi number."); long img=parseDelimitedNumber(s, delimiter); ImgRecord record=imgMap.get(img); number=(record==null ? -1 : record.taxID); }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete // System.err.println("Parsing ncbi number."); number=GiToTaxid.parseTaxidNumber(s, delimiter); } if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){ String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s)); if(AccessionToTaxid.LOADED()){ number=parseAccessionToTaxid(split); } if(number<0){ number=parseHeaderNameToTaxid(split); } } if(number<0 && e!=null){ assert(false) : e; throw new RuntimeException(e); } //TaxServer code could go here... if(number>=0){return getNode(number);} } if(verbose){System.err.println("Can't process name "+s);} if(Tools.isDigit(s.charAt(0)) && s.length()<=9){ try { return getNode(Integer.parseInt(s)); } catch (NumberFormatException e) { //ignore } } return null; } /** Parse a delimited number from a header, or return -1 if formatted incorrectly. */ static long parseDelimitedNumber(String s, char delimiter){ if(s==null){return -1;} int i=0; while(i<s.length() && s.charAt(i)!=delimiter){i++;} i++; if(i>=s.length() || !Tools.isDigit(s.charAt(i))){return -1;} long number=0; while(i<s.length()){ char c=s.charAt(i); if(c==delimiter || c==' ' || c=='\t'){break;} assert(Tools.isDigit(c)) : c+"\n"+s; number=(number*10)+(c-'0'); i++; } return number; } /** Parses sequence headers using NCBI's current header system, with Accessions. */ private TaxNode getNodeNewStyle(final String s){ int space=s.indexOf(' '); int number=-1; if(AccessionToTaxid.LOADED()){ if(space>0){ number=AccessionToTaxid.get(s.substring(0, space)); }else{ number=AccessionToTaxid.get(s); } } if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){ try { return getNode(Integer.parseInt(s)); } catch (NumberFormatException e) { //ignore } } if(number<0 && space>0){ number=parseNameToTaxid(s.substring(space+1)); } if(number>-1){return getNode(number);} if(space<0 && s.indexOf('_')>0){ return getNodeNewStyle(s.replace('_', ' ')); } return null; } /** * For parsing old-style NCBI headers. */ public int parseAccessionToTaxid(String[] split){ if(split.length<4){ return -1; } int ncbi=AccessionToTaxid.get(split[3]); return ncbi; } /** * For parsing old-style NCBI headers. */ public int parseHeaderNameToTaxid(String[] split){ if(split.length<5){ return -1; } return parseNameToTaxid(split[4]); } /** * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens"). * If multiple nodes share the same name, returns the first; to get the full list, * use getNodesByNameExtended. * @param name Organism name. * @return Organism TaxID, or -1 if not found. */ public int parseNameToTaxid(String name){ // assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size()); List<TaxNode> list=null; list=getNodesByNameExtended(name); if(list==null || list.size()>1){return -1;} return list.get(0).id; } /** * Fetch nodes indicated by this name. * @param name A taxonomic name delimited by space or underscore. * @return Nodes corresponding to the name. */ public List<TaxNode> getNodesByNameExtended(String name){ List<TaxNode> list=null; list=getNodesByName(name); if(list!=null){return list;} name=name.replaceAll("_", " ").trim(); list=getNodesByName(name); if(list!=null){return list;} String[] split2=name.split(" "); if(split2.length>7){ String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7]; list=getNodesByName(term); // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>6){ String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]; list=getNodesByName(term); // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>5){ String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]; list=getNodesByName(term); // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>4){ String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]; list=getNodesByName(term); // System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>3){ String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]; list=getNodesByName(term); // System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>2){ String term=split2[0]+" "+split2[1]+" "+split2[2]; list=getNodesByName(term); // System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>1){ String term=split2[0]+" "+split2[1]; list=getNodesByName(term); // System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } if(split2.length>0){ String term=split2[0]; list=getNodesByName(term); // System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); if(list!=null){return list;} } return null; } /*--------------------------------------------------------------*/ /*---------------- Assorted Methods ----------------*/ /*--------------------------------------------------------------*/ /** * Return the TaxID of the lowest ancestor node at least the specified level, * including this node itself. Level is the normal (non-extended) level. * @param taxID * @param taxLevel * @return */ public int promote(final int taxID, int taxLevel){ TaxNode tn=null; tn=(taxID<1 ? null : getNode(taxID)); tn=promote(tn, taxLevel); return (tn==null ? taxID : tn.id); } /** * Fetch the first node in this node's lineage of at least the indicated level. * This can be the node itself or an ancestor. * @see getNodeAtLevelExtended * @param tn Node in question * @param taxLevel Desired minimum level * @return A node at the desired level */ public TaxNode promote(TaxNode tn, int taxLevel){ while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){ TaxNode temp=getNode(tn.pid); if(temp==null || temp==tn || temp.level>=TaxTree.LIFE || temp.level>taxLevel){break;} tn=temp; } return tn; } /** * Determine the TaxID of the node's parent. * @param id TaxID of child node * @return Parent TaxID */ public int getParentID(int id){ assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." + "\nPlease redownload it and regenerate the taxtree."; if(id<0 || id>=nodes.length){return -1;} TaxNode tn=nodes[id]; if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);} return tn==null ? -1 : tn.pid; } /** * Fetch the node with this TaxID. * @param id TaxID * @return Node */ public TaxNode getNode(int id){ assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." + "\nPlease redownload it and regenerate the taxtree."; if(id<0 || id>=nodes.length){return null;} TaxNode tn=nodes[id]; if(tn!=null || mergedMap==null){return tn;} return getNode(mergedMap.get(id), true); } /** * Fetch the node with this TaxID, but don't throw assertions upon failure. * @param id TaxID * @return Node */ public TaxNode getNode(int id, boolean skipAssertion){ assert(skipAssertion || id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." + "\nPlease redownload it and regenerate the taxtree."; if(id<0 || id>=nodes.length){return null;} TaxNode tn=nodes[id]; if(tn!=null || mergedMap==null){return tn;} return getNode(mergedMap.get(id), true); } public TaxNode getNodeAtLevel(int id, int minLevel){ return getNodeAtLevel(id, minLevel, DOMAIN); } public TaxNode getNodeAtLevelExtended(int id, int minLevelE){ return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E); } public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){ final int minLevelExtended=levelToExtended(minLevel); final int maxLevelExtended=levelToExtended(maxLevel); return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended); } public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){ TaxNode tn=getNode(id); while(tn!=null && tn.pid!=tn.id && tn.levelExtended<minLevelE){ TaxNode temp=getNode(tn.pid); if(temp==null || temp.levelExtended>maxLevelE){break;} tn=temp; } return tn; } public int getIdAtLevelExtended(int taxID, int taxLevelExtended){ if(taxLevelExtended<0){return taxID;} TaxNode tn=getNode(taxID); while(tn!=null && tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){ tn=getNode(tn.pid); if(tn.levelExtended>taxLevelExtended){break;} taxID=tn.id; } return taxID; } /** * Fetch the node with this name. * Throw an assertion if there are multiple such nodes. * @param s Organism name. * @return Node with given name. */ public TaxNode getNodeByName(String s){ List<TaxNode> list=getNodesByName(s, false); if(list==null){list=getNodesByName(s, true);} if(list==null || list.isEmpty()){return null;} if(list.size()==1){return list.get(0);} assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n"; TaxNode a=list.get(0); for(int i=1; i<list.size(); i++){ TaxNode b=list.get(i); //Keep the most specific node // if(a==null || (b!=null && b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf())){//not necessary if(b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf()){ a=b; } } return a; } /** * Fetch a list of all nodes with this name. * @param s Organism name. * @return Nodes with given name. */ public List<TaxNode> getNodesByName(String s){ List<TaxNode> list=getNodesByName(s, false); if(list==null){list=getNodesByName(s, true);} return list; } /** * Fetch a map of names to nodes. If absent, create it first. * @param lowercase If true, return the map with lowercase keys. * @return Map of names to nodes. */ private HashMap<String, ArrayList<TaxNode>> getMap(boolean lowercase){ HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap); if(map==null){ synchronized(this){hashNames(true);} map=(lowercase ? nameMapLower : nameMap); } assert(map!=null) : "Tax names were not hashed."; return map; } private List<TaxNode> getNodesByName(String s, boolean lowercase){ if(s==null){return null;} if(s.indexOf('_')>=0){s=s.replace('_', ' ');} if(lowercase){s=s.toLowerCase();} // System.err.println("Searching for "+s); final HashMap<String, ArrayList<TaxNode>> map=getMap(lowercase); ArrayList<TaxNode> list=map.get(s); if(list!=null){return list;} // System.err.println("No matches for '"+s+"'"); // assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s); if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;} String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8); // System.err.println("Array: "+Arrays.toString(split)); list=map.get(split[split.length-1]); if(list==null){return list;} // System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")"); int matchCount=0; for(TaxNode tn : list){ if(tn.matchesName(split, split.length-1, this)){matchCount++;} } if(matchCount==list.size()){return list;} if(matchCount<1){return null;} ArrayList<TaxNode> hits=new ArrayList<TaxNode>(matchCount); for(TaxNode tn : list){ if(tn.matchesName(split, split.length-1, this)){hits.add(tn);} } return hits; } public ArrayList<TaxNode> getAncestors(int id){ TaxNode current=getNode(id); ArrayList<TaxNode> list=new ArrayList<TaxNode>(); while(current!=null && current.pid!=current.id){//ignores root list.add(current); current=getNode(current.pid); } //optionally add root here return list; } public void increment(IntList ids, IntList counts, boolean sync){ ids.sort(); ids.getUniqueCounts(counts); if(!sync){ for(int i=0; i<ids.size; i++){ int id=ids.get(i); int count=counts.get(i); incrementRaw(id, count); } }else{ synchronized(this){ for(int i=0; i<ids.size; i++){ int id=ids.get(i); int count=counts.get(i); incrementRaw(id, count); } } } } public void incrementRaw(int id, long amt){ assert(id>=0 && id<nodes.length) : "TaxID "+id+" is out of range."+(id<0 ? "" : " Possibly the taxonomy data needs to be updated."); assert(nodes[id]!=null) : "No node for TaxID "+id+"; possibly the taxonomy data needs to be updated."; nodes[id].incrementRaw(amt); } public void percolateUp(){ for(int i=0; i<treeLevelsExtended.length; i++){ percolateUp(i); } } public void percolateUp(final int fromLevel){ final TaxNode[] stratum=treeLevelsExtended[fromLevel]; for(final TaxNode n : stratum){ n.incrementSum(n.countRaw); TaxNode parent=nodes[n.pid]; if(n!=parent){ parent.incrementSum(n.countSum); } } } /** Add this amount to the node and all its ancestors. */ public void percolateUp(TaxNode node, long amt){ if(amt==0){return;} if(verbose){System.err.println("percolateUp("+amt+") node: "+node);} while(node.id!=node.pid){ node.incrementSum(amt); node=nodes[node.pid]; } node.incrementSum(amt); } public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){ return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1); } public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){ final int minLevelExtended=levelToExtended(minLevel); final int maxLevelExtended=levelToExtended(maxLevel); // assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended; ArrayList<TaxNode> list=new ArrayList<TaxNode>(); for(int i=minLevelExtended; i<nodesPerLevelExtended.length && i<=maxLevelExtended; i++){ list.addAll(gatherNodesAtLeastLimitExtended(i, limit)); } Shared.sort(list, TaxNode.countComparator); return list; } public ArrayList<TaxNode> gatherNodesAtLeastLimitExtended(final int fromLevelExtended, final long limit){ ArrayList<TaxNode> list=new ArrayList<TaxNode>(); final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended]; for(final TaxNode n : stratum){ if(n.countSum>=limit){ list.add(n); TaxNode parent=nodes[n.pid]; if(n!=parent){ percolateUp(parent, -n.countSum);//123 This was negative for some reason } } } Shared.sort(list, TaxNode.countComparator); return list; } /*--------------------------------------------------------------*/ /*---------------- Static Initializers ----------------*/ /*--------------------------------------------------------------*/ /** * Generate the name to level number map. */ private static HashMap<String, Integer> makeLevelMap() { HashMap<String, Integer> map=new HashMap<String, Integer>(31); for(int i=0; i<taxLevelNames.length; i++){ map.put(taxLevelNames[i], i); map.put(taxLevelNames[i].toUpperCase(), i); } map.put("clade", NO_RANK); map.put("clade".toUpperCase(), NO_RANK); return map; } /** * Generate the name to extended level number map. */ private static HashMap<String, Integer> makeLevelMapExtended() { HashMap<String, Integer> map=new HashMap<String, Integer>(129); for(int i=0; i<taxLevelNamesExtended.length; i++){ map.put(taxLevelNamesExtended[i], i); map.put(taxLevelNamesExtended[i].toUpperCase(), i); } map.put("clade", NO_RANK_E); map.put("clade".toUpperCase(), NO_RANK_E); return map; } /** * I think this maps normal and extend names to normal level numbers. */ private static HashMap<String, Integer> makeAltLevelMap() { HashMap<String, Integer> map=new HashMap<String, Integer>(129); for(int i=0; i<taxLevelNames.length; i++){ map.put(taxLevelNames[i], i); map.put(taxLevelNames[i].toUpperCase(), i); } map.put("clade", NO_RANK); map.put("clade".toUpperCase(), NO_RANK); //Add synonyms // map.put("subfamily", map.get("family")); // map.put("tribe", map.get("family")); // map.put("varietas", map.get("subspecies")); // map.put("subgenus", map.get("genus")); // map.put("forma", map.get("subspecies")); // map.put("species group", map.get("genus")); // map.put("species subgroup", map.get("genus")); // map.put("cohort", map.get("class")); // map.put("subclass", map.get("class")); // map.put("infraorder", map.get("order")); // map.put("superorder", map.get("class")); // map.put("subphylum", map.get("phylum")); // map.put("infraclass", map.get("class")); // map.put("superkingdom", map.get("division")); // map.put("parvorder", map.get("order")); // map.put("superclass", map.get("phylum")); // map.put("superphylum", map.get("kingdom")); // map.put("subkingdom", map.get("kingdom")); // map.put("superfamily", map.get("order")); // map.put("superkingdom", map.get("domain")); // map.put("suborder", map.get("order")); // map.put("subtribe", map.get("family")); for(String[] array : taxLevelNamesExtendedMatrix){ String head=array[array.length-1]; Integer value=map.get(head); assert(value!=null) : head; for(String key : array){ if(key!=head){ assert(!map.containsKey(key)) : "Map already contains key "+key+": "+Arrays.toString(array); map.put(key, value); map.put(key.toUpperCase(), value); } } } return map; } /*--------------------------------------------------------------*/ /*---------------- Size ----------------*/ /*--------------------------------------------------------------*/ /** Number of bp associated with this node in RefSeq */ public long toSize(TaxNode tn){ if(tn==null){return 0;} if(refseqSizeMap==null){return -1L;} final long x=refseqSizeMap.get(tn.id); return Tools.max(0, x); } /** Number of bp associated with this node and descendants in RefSeq */ public long toSizeC(TaxNode tn){ if(tn==null){return 0;} if(refseqSizeMapC==null){return -1L;} final long x=refseqSizeMapC.get(tn.id); return Tools.max(0, x); } /** Number of sequences associated with this node in RefSeq */ public int toSeqs(TaxNode tn){ if(tn==null){return 0;} if(refseqSeqMap==null){return -1;} final int x=refseqSeqMap.get(tn.id); return Tools.max(0, x); } /** Number of sequences associated with this node and descandants in RefSeq */ public long toSeqsC(TaxNode tn){ if(tn==null){return 0;} if(refseqSeqMapC==null){return -1L;} final long x=refseqSeqMapC.get(tn.id); return Tools.max(0, x); } /** Number of descendants of this node */ public int toNodes(TaxNode tn){ if(tn==null){return 0;} if(nodeMapC==null){return -1;} final int x=nodeMapC.get(tn.id); return Tools.max(0, x); } /** * Fills refseqSizeMap, refseqSizeMapC, etc. from a file containing the summary. * @param fname Size file name */ public void loadSizeFile(String fname){ if(fname==null){return;} assert(refseqSizeMap==null); refseqSizeMap=new IntLongHashMap(); refseqSizeMapC=new IntLongHashMap(); refseqSeqMap=new IntHashMap(); refseqSeqMapC=new IntLongHashMap(); nodeMapC=new IntHashMap(); final ByteFile bf=ByteFile.makeByteFile(fname, true); final byte delimiter='\t'; for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ if(line.length>0 && line[0]!='#'){ int a=0, b=0; while(b<line.length && line[b]!=delimiter){b++;} assert(b>a) : "Missing field 0: "+new String(line); int tid=Parse.parseInt(line, a, b); b++; a=b; while(b<line.length && line[b]!=delimiter){b++;} assert(b>a) : "Missing field 1: "+new String(line); long size=Parse.parseLong(line, a, b); b++; a=b; while(b<line.length && line[b]!=delimiter){b++;} assert(b>a) : "Missing field 2: "+new String(line); long csize=Parse.parseLong(line, a, b); b++; a=b; while(b<line.length && line[b]!=delimiter){b++;} assert(b>a) : "Missing field 3: "+new String(line); int seqs=Parse.parseInt(line, a, b); b++; a=b; while(b<line.length && line[b]!=delimiter){b++;} assert(b>a) : "Missing field 4: "+new String(line); long cseqs=Parse.parseLong(line, a, b); b++; a=b; while(b<line.length && line[b]!=delimiter){b++;} assert(b>a) : "Missing field 5: "+new String(line); int cnodes=Parse.parseInt(line, a, b); b++; a=b; if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);} if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);} if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);} if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);} if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);} } } bf.close(); } /*--------------------------------------------------------------*/ /*---------------- IMG ----------------*/ /*--------------------------------------------------------------*/ public static int imgToTaxid(long img){ ImgRecord ir=imgMap.get(img); // assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n"; return ir==null ? -1 : ir.taxID; } public TaxNode imgToTaxNode(long img){ int tid=imgToTaxid(img); return tid<1 ? null : getNode(tid); } // public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){ // assert(imgMap==null); // if(fname==null){return 0;} // ImgRecord2.storeName=storeName; // if(outstream!=null){System.err.println("Loading IMG.");} // Timer t=new Timer(outstream, false); // ImgRecord2[] array=ImgRecord2.toArray(fname); // int x=loadIMG(array); // t.stopAndPrint(); // return x; // } public static int loadIMG(String fname, boolean storeName, PrintStream outstream){ assert(imgMap==null); if(fname==null){return 0;} ImgRecord.storeName=storeName; if(outstream!=null){System.err.println("Loading IMG.");} Timer t=new Timer(outstream, false); ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ); int x=loadIMG(array); t.stopAndPrint(); return x; } public static int loadIMG(ImgRecord[] array){ assert(imgMap==null); imgMap=new HashMap<Long, ImgRecord>((int)(array.length*1.5)); for(ImgRecord record : array){ imgMap.put(record.imgID, record); } return imgMap.size(); } @Deprecated public static int parseLevel(String b){ final int level; if(b==null){level=-1;} else if(Tools.isNumeric(b.charAt(0))){ level=Integer.parseInt(b); }else{ level=stringToLevel(b.toLowerCase()); } return level; } public static int parseLevelExtended(String b){ final int level; if(b==null){level=-1;} else if(Tools.isNumeric(b.charAt(0))){ level=levelToExtended(Integer.parseInt(b)); }else{ level=stringToLevelExtended(b.toLowerCase()); } return level; } public boolean isUnclassified(int tid){ TaxNode tn=getNode(tid); while(tn!=null && tn.id!=tn.pid){ if(tn.isUnclassified()){return true;} if(tn.pid==tn.id){break;} tn=getNode(tn.pid); } return false; } public boolean isEnvironmentalSample(int tid){ TaxNode tn=getNode(tid); while(tn!=null && tn.id!=tn.pid){ if(tn.isEnvironmentalSample()){return true;} if(tn.pid==tn.id){break;} tn=getNode(tn.pid); } return false; } public boolean isVirus(int tid){ TaxNode tn=getNode(tid); while(tn!=null && tn.id!=tn.pid){ if(tn.id==VIRUSES_ID){return true;} if(tn.pid==tn.id){break;} tn=getNode(tn.pid); } return false; } public long definedLevels(int tid){ long levels=0; TaxNode tn=getNode(tid); while(tn!=null && tn.id!=tn.pid){ levels=levels|(1L<<tn.level); } return levels; } public long definedLevelsExtended(int tid){ long levels=0; TaxNode tn=getNode(tid); while(tn!=null && tn.id!=tn.pid){ levels=levels|(1L<<tn.levelExtended); } return levels; } /** * Generates the mononomial name for this taxonomic level based on the scientific name. * For example, "Homo sapiens" -> "Sapiens" * @param tid TaxID * @return Correct name for this node. */ public String mononomial(int tid){return mononomial(getNode(tid));} public String mononomial(TaxNode tn){ if(tn==null){return null;} String name=tn.name; if(name.indexOf(' ')<0){return name;} TaxNode parent=getNode(tn.pid); if(parent==null){return name;} String pname=parent.name; if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){ name=name.substring(pname.length()+1); } return name; } /*--------------------------------------------------------------*/ /*---------------- Fields ----------------*/ /*--------------------------------------------------------------*/ /** All nodes in the tree in a flat array, indexed by TaxiD */ public final TaxNode[] nodes; /** Number of nodes per normal level */ public final int[] nodesPerLevel=new int[taxLevelNames.length]; /** Number of nodes per extended level */ public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length]; /** Number of nodes in the tree */ public final int nodeCount; /** Maps old TaxIDs to new TaxIDs */ public final IntHashMap mergedMap; /** Arrays of all nodes at a given taxonomic level (extended) */ public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][]; /** Map of names to nodes */ HashMap<String, ArrayList<TaxNode>> nameMap; /** Map of lowercase names to nodes */ HashMap<String, ArrayList<TaxNode>> nameMapLower; /** Map of nodes to child nodes */ HashMap<TaxNode, ArrayList<TaxNode>> childMap; public HashMap<String, ArrayList<TaxNode>> nameMap(){return nameMap;} @Deprecated public int minValidTaxa=0; //TODO: Remove (will break serialization) /** Infer ranks for no-rank nodes, when possible */ public boolean simplify=true; /** See simplify() for details, works in conjunction with simplify */ public boolean reassign=true; /** Discard no-rank nodes */ public boolean skipNorank=false; public int inferRankLimit=0;//levelMap.get("species"); //Node Statistics /** Number of bases assigned to this TaxID in RefSeq */ private IntLongHashMap refseqSizeMap; /** Number of bases assigned to this TaxID and descendants in RefSeq */ private IntLongHashMap refseqSizeMapC; /** Number of sequences assigned to this TaxID in RefSeq */ private IntHashMap refseqSeqMap; /** Number of sequences assigned to this TaxID and descendants in RefSeq */ private IntLongHashMap refseqSeqMapC; /** Number of descendant nodes, inclusive, for each TaxID */ private IntHashMap nodeMapC; /*--------------------------------------------------------------*/ /*---------------- Statics ----------------*/ /*--------------------------------------------------------------*/ /** Assign levels to unranked nodes below species level, when possible */ public static boolean assignStrains=true; /** Assume headers are in Silva format */ public static boolean SILVA_MODE=false; /** Assume headers are in Unite format */ public static boolean UNITE_MODE=false; /** Probably unnecessary at this point... present for legacy reasons */ public static boolean CRASH_IF_NO_GI_TABLE=true; public static boolean verbose=false; public static boolean SHOW_WARNINGS=false; /** Maps IMG IDs to records from the dump file */ private static HashMap<Long, ImgRecord> imgMap; /** Set to false if the tree is expected to be mutated. * @TODO Remove mutable fields from the tree (like counters). */ public static boolean ALLOW_SHARED_TREE=true; /** Universal location of the shared TaxTree used by various classes */ private static TaxTree sharedTree; /** A simpler and probably less safe version of sharedTree(...) */ public static TaxTree getTree(){return sharedTree;} /** * Fetch the shared tree, loading it from file if not present. * @return A tree. * @TODO: Check proper-construction of double-checked synchronize */ private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) { if(!ALLOW_SHARED_TREE){return null;} if(sharedTree==null && fname!=null){ if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();} synchronized(TaxTree.class){ if(sharedTree==null){ if(outstream!=null){outstream.println("Loading tax tree.");} Timer t=new Timer(outstream, false); setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat); t.stopAndPrint(); } } } if(hashNames && sharedTree.nameMap==null){ synchronized(sharedTree){ if(sharedTree.nameMap==null){ if(outstream!=null){outstream.println("Hashing names.");} Timer t=new Timer(outstream, false); sharedTree.hashNames(hashDotFormat); t.stopAndPrint(); } } } return sharedTree; } /** * For initialization. Normally only one tree is needed by a process so it is set here. * If the tree is already set nothing will happen, unless additional hashing is needed. */ private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){ assert(ALLOW_SHARED_TREE); assert(sharedTree==null); sharedTree=tree; if(hashNames && sharedTree.nameMap==null){ synchronized(sharedTree){ if(sharedTree.nameMap==null){ sharedTree.hashNames(hashDotFormat); } } } } /** * Determine whether a taxonomic level is standard. e.g.:<br> * isSimple("phylum")=true<br> * isSimple("subphylum")=false<br> * isSimple("no-rank")=false * @param levelExtended The extended level to test. * @return True if this level is not no-rank, and the names of the normal and extended levels match. */ public static boolean isSimple(int levelExtended){ int level=extendedToLevel(levelExtended); return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)); } /** * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br> * isSimple("phylum")=true<br> * isSimple("substrain")=true<br> * isSimple("subphylum")=false<br> * isSimple("no-rank")=false * @param levelExtended The extended level to test. * @return True if this level is not no-rank, and the names of the normal and extended levels match. */ public static boolean isSimple2(int levelExtended){ int level=extendedToLevel(levelExtended); return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level) || levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E); } /*--------------------------------------------------------------*/ /*---------------- Constants ----------------*/ /*--------------------------------------------------------------*/ /** Get the number for the normal level of this name */ public static final int stringToLevel(String s){return altLevelMap.get(s);} public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);} /** Get the number for the extended level of this name */ public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);} /** Get the normal name for this normal level */ public static final String levelToString(int x){return taxLevelNames[x];} /** Get the extended name for this extended level */ public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];} /** Get the abbreviated name for this normal level */ public static final String levelToStringShort(int x){return taxLevelNamesShort[x];} /** Normal, aka canonical, aka simple tax level names */ private static final String[] taxLevelNames=new String[] { "no rank", "subspecies", "species", "genus", "family", "order", "class", "phylum", "kingdom", "superkingdom", "domain", "life" }; public static final int numTaxLevelNames=taxLevelNames.length; /** * Definitive representation of all NCBI taxonomic level names. * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash. * The first dimension maps normal ranks to extended ranks. * Both dimensions are ordered ascending. * @TODO Note! If this goes over 63 names it will cause a problem with getDefinedLevels(). */ //TODO See @TODO private static final String[][] taxLevelNamesExtendedMatrix=new String[][] { {"no rank"}, {"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup", "biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas", "subspecies"}, {"species"}, {"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"}, {"subtribe", "tribe", "subfamily", "family"}, {"superfamily", "parvorder", "infraorder", "suborder", "order"}, {"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"}, {"superclass", "subdivision", "division", "subphylum", "phylum"}, {"superphylum", "subkingdom", "kingdom"}, {"superkingdom"}, {"domain"}, {"life"} }; /** Extended tax level names as a 1D array */ private static final String[] taxLevelNamesExtended=makeNamesExtended(); /** Number of extended tax levels */ public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length; /** Flatten the extended tax level names matrix to a 1D array */ private static final String[] makeNamesExtended(){ ArrayList<String> list=new ArrayList<String>(); for(String[] s : taxLevelNamesExtendedMatrix){ for(String ss : s){ list.add(ss); } } return list.toArray(new String[0]); } /** Abbreviations of tax level names, mainly for semicolon form */ private static final String[] taxLevelNamesShort=new String[] { "nr", "ss", "s", "g", "f", "o", "c", "p", "k", "sk", "d", "l" }; /** Normal tax level numbers as constants */ public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3, FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11; /** TaxID of Life node */ public static final int LIFE_ID=1; /** TaxID of Cellular Organisms node */ public static final int CELLULAR_ORGANISMS_ID=131567; /** TaxID of Bacteria node */ public static final int BACTERIA_ID=2; //Is this safe? Who knows... /** TaxID of Archaea node */ public static final int ARCHAEA_ID=2157; /** TaxID of Euk node */ public static final int EUKARYOTA_ID=2759; /** TaxID of Animal node */ public static final int METAZOA_ID=33208, ANIMALIA_ID=33208; /** TaxID of Plant node */ public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090; /** TaxID of Fungi node */ public static final int FUNGI_ID=4751; /** TaxID of Virus node */ public static final int VIRUSES_ID=10239; /** TaxID of Viroids node (now defunct) */ public static final int VIROIDS_ID=12884; /** Maps normal level names to normal level numbers */ private static final HashMap<String, Integer> levelMap=makeLevelMap(); /** Maps extended level names to extended level numbers */ private static final HashMap<String, Integer> levelMapExtended=makeLevelMapExtended(); /** Maps extended level names to normal level numbers */ private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap(); /** Common extended level numbers as constants */ public static final int NO_RANK_E=NO_RANK, SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"), SUBSPECIES_E=stringToLevelExtended("subspecies"), SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"), FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"), CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"), KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"), DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life"); /** Map of normal to extended level numbers */ private static final int[] levelToExtended=new int[] { NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E, ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E }; /** Map of extended to normal level numbers */ private static final int[] extendedToLevel=makeExtendedToLevel(); /** Creates extendedToLevel from taxaNamesExtendedMatrix during initialization. */ private static int[] makeExtendedToLevel(){ int len=0; for(String[] array : taxLevelNamesExtendedMatrix){ len+=array.length; } int[] ret=new int[len]; int pos=0; for(int level=0; level<taxLevelNamesExtendedMatrix.length; level++){ String[] array=taxLevelNamesExtendedMatrix[level]; for(int i=0; i<array.length; i++){ ret[pos]=level; pos++; } } return ret; } /** Convert a standard level number (like KINGDOM) to extended (like KINGDOM_E). */ public static final int levelToExtended(int level){ return level<0 ? level : levelToExtended[level]; } /** Convert an extended level number (like PHYLUM_E) to extended (like PHYLUM). * Non-standard levels will be converted to the next higher standard level; * e.g., subphylum -> phylum */ public static final int extendedToLevel(int extended){ return extended<0 ? -1 : extendedToLevel[extended]; } /* Pre-compiled delimiters to save time when splitting lines */ private static final Pattern delimiterTab = Pattern.compile("\t"); private static final Pattern delimiter = Pattern.compile("\t\\|\t"); private static final Pattern delimiterPipe = Pattern.compile("\\|"); private static final Pattern delimiterTilde = Pattern.compile("\\~"); private static final Pattern delimiter2 = Pattern.compile("[\\s_]+"); public static boolean IMG_HQ=false; /* For these fields, see the corresponding functions, below. * They define the default paths to various data on NERSC. */ private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest"; private static final String defaultTaxPathAws="/test1/tax/latest"; private static final String defaultTaxPathIGBVM="/data/tax/latest"; private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz"; private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz"; private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz"; private static final String defaultImgFile="TAX_PATH/imgDump.txt"; private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz"; private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz"; private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz"; private static final String defaultPatternFile="TAX_PATH/patterns.txt"; private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz"; private static final String defaultAccessionFile= //"TAX_PATH/shrunk.protF.accession2taxid.gz," + "TAX_PATH/shrunk.prot.accession2taxid.gz," + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz," + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz," + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz," // + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz," + "TAX_PATH/shrunk.dead_wgs.accession2taxid.gz," // + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz," + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz," + "TAX_PATH/shrunk.pdb.accession2taxid.gz"; /** For setting TAX_PATH, the root to taxonomy files */ public static final String defaultTaxPath(){ return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc; } /** 16S consensus sequences per TaxID */ public static final String default16SFile(){ return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc; } /** 18S consensus sequences per TaxID */ public static final String default18SFile(){ return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc; } /** Path to all taxonomy files, substituted in to make specific file paths */ public static String TAX_PATH=defaultTaxPath(); /** Location of gitable.int2d.gz for gi lookups */ public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);} /** Location of tree.taxtree.gz */ public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);} //Use the prot.FULL.gz ncbi file. public static boolean protFull=false; /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */ public static final String defaultAccessionFile(){ String s=(protFull ? "TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile; return s.replaceAll("TAX_PATH", TAX_PATH); } /** Location of patterns.txt, which holds information about observed accession string formats */ public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);} /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */ public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);} /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */ public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);} /** Screen output gets printed here */ private static PrintStream outstream=System.out; }