jpayne@68: package tax; jpayne@68: jpayne@68: import java.io.PrintStream; jpayne@68: import java.io.Serializable; jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.Arrays; jpayne@68: import java.util.HashMap; jpayne@68: import java.util.LinkedHashMap; jpayne@68: import java.util.List; jpayne@68: import java.util.regex.Pattern; jpayne@68: jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.ReadWrite; jpayne@68: import fileIO.TextFile; jpayne@68: import shared.Parse; jpayne@68: import shared.Parser; jpayne@68: import shared.PreParser; jpayne@68: import shared.Shared; jpayne@68: import shared.Timer; jpayne@68: import shared.Tools; jpayne@68: import structures.ByteBuilder; jpayne@68: import structures.IntHashMap; jpayne@68: import structures.IntList; jpayne@68: import structures.IntLongHashMap; jpayne@68: jpayne@68: /** jpayne@68: * Represents a taxonomic tree. jpayne@68: * Usually just one of these needs to be created for a process. jpayne@68: * Designed for NCBI's taxdmp.zip file contents. jpayne@68: * @author Brian Bushnell jpayne@68: * @date Mar 6, 2015 jpayne@68: * jpayne@68: */ jpayne@68: public class TaxTree implements Serializable{ jpayne@68: jpayne@68: /** jpayne@68: * jpayne@68: */ jpayne@68: private static final long serialVersionUID = 5894416521711540017L; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Main ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Code entrance from the command line. jpayne@68: * This is not called normally, only when converting NCBI text files jpayne@68: * into a binary representation and writing it to disk. 
jpayne@68: * @param args Command line arguments jpayne@68: */ jpayne@68: public static void main(String[] args){ jpayne@68: jpayne@68: {//Preparse block for help, config files, and outstream jpayne@68: PreParser pp=new PreParser(args, outstream, null, false); jpayne@68: args=pp.args; jpayne@68: outstream=pp.outstream; jpayne@68: } jpayne@68: jpayne@68: assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n"; jpayne@68: ReadWrite.USE_UNPIGZ=true; jpayne@68: ReadWrite.USE_PIGZ=true; jpayne@68: ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9); jpayne@68: ReadWrite.PIGZ_BLOCKSIZE=256; jpayne@68: ReadWrite.PIGZ_ITERATIONS=60; jpayne@68: jpayne@68: Timer t=new Timer(); jpayne@68: TaxTree tree=new TaxTree(args); jpayne@68: t.stop(); jpayne@68: jpayne@68: outstream.println("Retained "+tree.nodeCount+" nodes:"); jpayne@68: jpayne@68: for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){ jpayne@68: outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]); jpayne@68: if(verbose){ jpayne@68: int lim=10; jpayne@68: for(int j=0; j "+tree.nodes[n.pid]); jpayne@68: } jpayne@68: for(int j=tree.treeLevelsExtended[i].length-lim; j=lim){ jpayne@68: TaxNode n=tree.treeLevelsExtended[i][j]; jpayne@68: outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: outstream.println(); jpayne@68: } jpayne@68: jpayne@68: jpayne@68: outstream.println(); jpayne@68: outstream.println("Time: \t"+t); jpayne@68: jpayne@68: if(args.length>2){//Write a tree jpayne@68: ReadWrite.write(tree, args[3], true); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /** Parse arguments from the command line */ jpayne@68: private Parser parse(String[] args){ jpayne@68: jpayne@68: //Create a parser object jpayne@68: Parser parser=new Parser(); jpayne@68: jpayne@68: //Set any necessary Parser defaults here jpayne@68: //parser.foo=bar; jpayne@68: jpayne@68: //Parse each argument jpayne@68: for(int i=0; i1 ? 
split[1] : null; jpayne@68: if(b!=null && b.equalsIgnoreCase("null")){b=null;} jpayne@68: jpayne@68: if(a.equals("verbose")){ jpayne@68: verbose=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("parse_flag_goes_here")){ jpayne@68: long fake_variable=Parse.parseKMG(b); jpayne@68: //Set a variable here jpayne@68: }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser jpayne@68: //do nothing jpayne@68: }else if(i>3){ jpayne@68: outstream.println("Unknown parameter "+args[i]); jpayne@68: assert(false) : "Unknown parameter "+args[i]; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: return parser; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Initialization ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Constructors ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Constructor using filenames from command line arguments, in the format of: jpayne@68: * {names, nodes, merged} jpayne@68: * @param args Command line arguments jpayne@68: */ jpayne@68: private TaxTree(String[] args){ jpayne@68: this(args[0], args[1], args[2], args); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * @param namesFile NCBI names.txt jpayne@68: * @param nodesFile NCBI nodes.txt jpayne@68: * @param mergedFile NCBI merged.txt jpayne@68: * @param args jpayne@68: */ jpayne@68: private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){ jpayne@68: jpayne@68: if(args!=null) { jpayne@68: Parser parser=parse(args); jpayne@68: } jpayne@68: jpayne@68: nodes=getNames(namesFile); jpayne@68: getNodes(nodesFile, nodes); jpayne@68: jpayne@68: mergedMap=getMerged(mergedFile); jpayne@68: jpayne@68: countChildren(); jpayne@68: 
outstream.println("Counted children."); jpayne@68: int rounds=percolate(); jpayne@68: outstream.println("Percolated "+rounds+" rounds to fixpoint."); jpayne@68: jpayne@68: if(assignStrains){ jpayne@68: assignStrains(); jpayne@68: rounds=percolate(); jpayne@68: outstream.println("Percolated "+rounds+" rounds to fixpoint."); jpayne@68: } jpayne@68: jpayne@68: if(simplify){ jpayne@68: if(verbose){outstream.println("Simplifying.");} jpayne@68: int removed=simplify(nodes); jpayne@68: if(verbose){outstream.println("Removed "+removed+" nodes.");} jpayne@68: rounds=percolate(); jpayne@68: outstream.println("Percolated "+rounds+" rounds to fixpoint."); jpayne@68: } jpayne@68: int errors=test(nodes); jpayne@68: // assert(errors==0); //Not possible since the tree is wrong. jpayne@68: if(errors>0) { jpayne@68: System.err.println("Found "+errors+" errors in tree."); jpayne@68: } jpayne@68: jpayne@68: for(TaxNode n : nodes){ jpayne@68: if(n!=null){ jpayne@68: nodesPerLevel[n.level]++; jpayne@68: nodesPerLevelExtended[n.levelExtended]++; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: // for(int i=0; i bactList=new ArrayList(); jpayne@68: ArrayList archList=new ArrayList(); jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){ jpayne@68: if(descendsFrom(tn, bacteria)){ jpayne@68: bactList.add(tn); jpayne@68: }else if(descendsFrom(tn, archaea)){ jpayne@68: archList.add(tn); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: ArrayList prokList=new ArrayList(bactList.size()+archList.size()); jpayne@68: prokList.addAll(bactList); jpayne@68: prokList.addAll(archList); jpayne@68: jpayne@68: for(TaxNode tn : prokList){ jpayne@68: if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ jpayne@68: TaxNode parent=nodes[tn.pid]; jpayne@68: if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ jpayne@68: tn.levelExtended=STRAIN_E; jpayne@68: tn.level=SUBSPECIES; jpayne@68: 
tn.setOriginalLevel(STRAIN_E); jpayne@68: strains++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: // outstream.println("Assigned "+strains+" strains."); jpayne@68: for(TaxNode tn : prokList){ jpayne@68: if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ jpayne@68: TaxNode parent=nodes[tn.pid]; jpayne@68: if(parent.levelExtended==STRAIN_E){ jpayne@68: tn.levelExtended=SUBSTRAIN_E; jpayne@68: tn.level=SUBSPECIES; jpayne@68: tn.setOriginalLevel(SUBSTRAIN_E); jpayne@68: substrains++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: // outstream.println("Assigned "+substrains+" substrains."); jpayne@68: } jpayne@68: jpayne@68: @Deprecated jpayne@68: private void assignStrainsOld(){ jpayne@68: jpayne@68: outstream.println("Assigning strains."); jpayne@68: int strains=0, substrains=0; jpayne@68: TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed jpayne@68: assert(bacteria.name.equalsIgnoreCase("Bacteria")); jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null && tn.originalLevel()==NO_RANK){ jpayne@68: TaxNode parent=nodes[tn.pid]; jpayne@68: if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){ jpayne@68: // nodesPerLevelExtended[STRAIN_E]++; jpayne@68: // nodesPerLevelExtended[tn.levelExtended]--; jpayne@68: tn.levelExtended=STRAIN_E; jpayne@68: tn.level=SUBSPECIES; jpayne@68: tn.setOriginalLevel(STRAIN_E); jpayne@68: strains++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: // outstream.println("Assigned "+strains+" strains."); jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null && tn.originalLevel()==NO_RANK){ jpayne@68: TaxNode parent=nodes[tn.pid]; jpayne@68: if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){ jpayne@68: // nodesPerLevelExtended[SUBSTRAIN_E]++; jpayne@68: // nodesPerLevelExtended[tn.levelExtended]--; jpayne@68: tn.levelExtended=SUBSTRAIN_E; jpayne@68: tn.level=SUBSPECIES; jpayne@68: tn.setOriginalLevel(SUBSTRAIN_E); 
jpayne@68: substrains++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: // outstream.println("Assigned "+substrains+" substrains."); jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Construction ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Create tax nodes using names in the designated file. jpayne@68: * @param fname NCBI names.txt jpayne@68: * @return Array of created nodes, where array[x] contains the node with TaxID x. jpayne@68: */ jpayne@68: private static TaxNode[] getNames(String fname){ jpayne@68: ArrayList list=new ArrayList(200000); jpayne@68: int max=0; jpayne@68: jpayne@68: TextFile tf=new TextFile(fname, false); jpayne@68: for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ jpayne@68: if(s.contains("scientific name")){ jpayne@68: String[] split=delimiter.split(s, 3); jpayne@68: assert(split.length==3) : s; jpayne@68: int id=Integer.parseInt(split[0]); jpayne@68: String name=split[1]; jpayne@68: if(id==1 && name.equalsIgnoreCase("root")){name="Life";} jpayne@68: max=Tools.max(max, id); jpayne@68: list.add(new TaxNode(id, name)); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: TaxNode[] nodes=new TaxNode[max+1]; jpayne@68: for(TaxNode n : list){ jpayne@68: assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n; jpayne@68: nodes[n.id]=n; jpayne@68: } jpayne@68: jpayne@68: return nodes; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Parses names file a second time to fill in additional information. jpayne@68: * Should really be merged into getNames. 
jpayne@68: * @TODO Merge into getNames jpayne@68: */ jpayne@68: private static TaxNode[] getNodes(String fname, TaxNode[] nodes){ jpayne@68: jpayne@68: int max=0; jpayne@68: jpayne@68: LinkedHashMap oddNames=new LinkedHashMap(); jpayne@68: jpayne@68: TextFile tf=new TextFile(fname, false); jpayne@68: for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ jpayne@68: String[] split=delimiter.split(s, 4); jpayne@68: assert(split.length==4) : s; jpayne@68: int id=-1, pid=-1, level=-1, levelExtended=-1; jpayne@68: jpayne@68: id=Integer.parseInt(split[0]); jpayne@68: try { jpayne@68: pid=Integer.parseInt(split[1]); jpayne@68: } catch (NumberFormatException e) { jpayne@68: // TODO Auto-generated catch block jpayne@68: e.printStackTrace(); jpayne@68: System.err.println("Bad line: "+s+"\n"+Arrays.toString(split)); jpayne@68: } jpayne@68: boolean alt=false; jpayne@68: { jpayne@68: String key=split[2]; jpayne@68: Integer obj0=levelMap.get(key); jpayne@68: Integer obj=levelMapExtended.get(key); jpayne@68: assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split); jpayne@68: jpayne@68: if(obj0==null){ jpayne@68: obj0=altLevelMap.get(key); jpayne@68: alt=true; jpayne@68: } jpayne@68: if(obj0!=null){ jpayne@68: level=obj0; jpayne@68: levelExtended=obj; jpayne@68: if(id==pid){ jpayne@68: level=LIFE; jpayne@68: levelExtended=LIFE_E; jpayne@68: alt=false; jpayne@68: } jpayne@68: }else{ jpayne@68: if(id==pid){ jpayne@68: level=LIFE; jpayne@68: levelExtended=LIFE_E; jpayne@68: alt=false; jpayne@68: }else{ jpayne@68: int[] count=oddNames.get(key); jpayne@68: if(count==null){ jpayne@68: count=new int[1]; jpayne@68: oddNames.put(key, count); jpayne@68: } jpayne@68: count[0]++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: max=Tools.max(max, id); jpayne@68: TaxNode n=nodes[id]; jpayne@68: assert(n!=null && n.pid<0) : n+" -> "+s; jpayne@68: n.pid=pid; jpayne@68: n.level=level; jpayne@68: n.levelExtended=levelExtended; jpayne@68: n.setOriginalLevel(levelExtended); 
jpayne@68: n.setCanonical(!alt); jpayne@68: assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n"; jpayne@68: } jpayne@68: jpayne@68: if(oddNames.size()>0){ jpayne@68: outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:"); jpayne@68: if(verbose){ jpayne@68: for(String s : oddNames.keySet()){ jpayne@68: outstream.println(oddNames.get(s)[0]+"\t"+s); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: return nodes; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Count child nodes of each node. jpayne@68: * This can be used to size arrays or determine which nodes are leaves. jpayne@68: */ jpayne@68: private void countChildren(){ jpayne@68: for(TaxNode child : nodes){ jpayne@68: if(child!=null && child.pid!=child.id){ jpayne@68: TaxNode parent=getNode(child.pid); jpayne@68: if(parent!=child){parent.numChildren++;} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: //TODO - This could be finished in 2 passes using the childTable. jpayne@68: /** jpayne@68: * Fill derived fields minParentLevelExtended and maxChildLevelExtended jpayne@68: * by percolating information through the tree until a fixpoint is reached. jpayne@68: * @TODO This could be finished in 2 passes using the childTable. jpayne@68: * @return Number of rounds required to reach fixpoint. 
jpayne@68: */ jpayne@68: private int percolate(){ jpayne@68: boolean changed=true; jpayne@68: int rounds=0; jpayne@68: while(changed){ jpayne@68: changed=false; jpayne@68: rounds++; jpayne@68: for(TaxNode child : nodes){ jpayne@68: if(child!=null && child.pid!=child.id){ jpayne@68: TaxNode parent=getNode(child.pid); jpayne@68: changed=(child.discussWithParent(parent) | changed); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(!changed){break;} jpayne@68: changed=false; jpayne@68: rounds++; jpayne@68: for(int i=nodes.length-1; i>=0; i--){ jpayne@68: TaxNode child=nodes[i]; jpayne@68: if(child!=null && child.pid!=child.id){ jpayne@68: TaxNode parent=getNode(child.pid); jpayne@68: changed=(child.discussWithParent(parent) | changed); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return rounds; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Load nodes into the nameMap and nameMapLower, mapped to their names. jpayne@68: * @param genusDotSpecies Also hash abbreviations such as E.coli. jpayne@68: */ jpayne@68: public synchronized void hashNames(boolean genusDotSpecies){ jpayne@68: if(nameMap!=null){return;} jpayne@68: assert(nameMap==null); jpayne@68: assert(nameMapLower==null); jpayne@68: final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? 
nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN)); jpayne@68: nameMap=new HashMap>(size); jpayne@68: nameMapLower=new HashMap>(size); jpayne@68: jpayne@68: //Hash the names, both lowercase and uppercase jpayne@68: for(TaxNode n : nodes){ jpayne@68: if(n!=null){ jpayne@68: String name=n.name; jpayne@68: if(name.indexOf('_')>=0){ jpayne@68: name=name.replace('_', ' ').trim(); jpayne@68: } jpayne@68: if(name!=null && !name.equals("environmental samples")){ jpayne@68: { jpayne@68: ArrayList list=nameMap.get(name); jpayne@68: if(list==null){ jpayne@68: list=new ArrayList(); jpayne@68: nameMap.put(name, list); jpayne@68: } jpayne@68: list.add(n); jpayne@68: } jpayne@68: { jpayne@68: String lc=name.toLowerCase(); jpayne@68: ArrayList list=nameMapLower.get(lc); jpayne@68: if(list==null){ jpayne@68: list=new ArrayList(); jpayne@68: nameMapLower.put(lc, list); jpayne@68: } jpayne@68: list.add(n); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: //Hash G.species versions of the names, both lowercase and uppercase jpayne@68: if(genusDotSpecies){ jpayne@68: ByteBuilder bb=new ByteBuilder(64); jpayne@68: for(TaxNode n : nodes){ jpayne@68: if(n!=null && n.levelExtended==SPECIES_E){ jpayne@68: String name=n.name; jpayne@68: if(name.indexOf('_')>=0){ jpayne@68: name=name.replace('_', ' ').trim(); jpayne@68: } jpayne@68: if(name!=null && !name.equals("environmental samples")){ jpayne@68: final String dotFormat=dotFormat(name, bb); jpayne@68: if(dotFormat!=null){ jpayne@68: { jpayne@68: ArrayList list=nameMap.get(dotFormat); jpayne@68: if(list==null){ jpayne@68: list=new ArrayList(); jpayne@68: nameMap.put(dotFormat, list); jpayne@68: } jpayne@68: list.add(n); jpayne@68: } jpayne@68: { jpayne@68: String lc=dotFormat.toLowerCase(); jpayne@68: ArrayList list=nameMapLower.get(lc); jpayne@68: if(list==null){ jpayne@68: list=new ArrayList(); jpayne@68: nameMapLower.put(lc, list); jpayne@68: } jpayne@68: list.add(n); jpayne@68: } jpayne@68: } 
jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Generate the "dot format" name of a node. jpayne@68: * For example, transform "Homo sapiens" to "H.sapiens" jpayne@68: * @param name Node name jpayne@68: * @param buffer A ByteBuilder that may be modified jpayne@68: * @return Dot format jpayne@68: */ jpayne@68: private static String dotFormat(String name, ByteBuilder buffer){ jpayne@68: if(name==null || name.indexOf('.')>=0){return null;} jpayne@68: final int firstSpace=name.indexOf(' '); jpayne@68: if(firstSpace<0 || firstSpace>=name.length()-1){return null;} jpayne@68: final int lastSpace=name.lastIndexOf(' '); jpayne@68: if(firstSpace!=lastSpace){return null;} jpayne@68: final String a=name.substring(0, firstSpace); jpayne@68: final String b=name.substring(lastSpace+1); jpayne@68: final char ca=a.charAt(0); jpayne@68: final char cb=b.charAt(0); jpayne@68: if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;} jpayne@68: if(buffer==null){buffer=new ByteBuilder(2+b.length());} jpayne@68: else{buffer.clear();} jpayne@68: buffer.append(ca).append('.').append(b); jpayne@68: return buffer.toString(); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fill childMap, which maps nodes to their children. 
jpayne@68: */ jpayne@68: public synchronized void hashChildren(){ jpayne@68: assert(childMap==null); jpayne@68: int nodesWithChildren=0; jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null && tn.numChildren>0){nodesWithChildren++;} jpayne@68: } jpayne@68: childMap=new HashMap>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN)); jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null){ jpayne@68: if(tn.numChildren>0){ jpayne@68: childMap.put(tn, new ArrayList(tn.numChildren)); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null){ jpayne@68: if(tn.id!=tn.pid){ jpayne@68: ArrayList list=childMap.get(getNode(tn.pid)); jpayne@68: if(list!=null){list.add(tn);} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch this node's children. jpayne@68: * @param parent Node in question jpayne@68: * @return List of child nodes jpayne@68: */ jpayne@68: public ArrayList getChildren(TaxNode parent){ jpayne@68: if(parent.numChildren<1){return null;} jpayne@68: if(childMap!=null){return childMap.get(parent);} jpayne@68: ArrayList list=new ArrayList(parent.numChildren); jpayne@68: for(TaxNode tn : nodes){ jpayne@68: if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){ jpayne@68: list.add(tn); jpayne@68: } jpayne@68: } jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Load a map of old to new TaxIDs. jpayne@68: * @param mergedFile NCBI merged.txt. 
jpayne@68: * @return Map of old to new TaxIDs jpayne@68: */ jpayne@68: private static IntHashMap getMerged(String mergedFile) { jpayne@68: if(mergedFile==null){return null;} jpayne@68: String[] lines=TextFile.toStringLines(mergedFile); jpayne@68: if(lines.length<1){return null;} jpayne@68: IntHashMap map=new IntHashMap((int)(lines.length*1.3)); jpayne@68: for(String line : lines){ jpayne@68: String[] split=delimiterTab.split(line); jpayne@68: int a=Integer.parseInt(split[0]); jpayne@68: int b=Integer.parseInt(split[2]); jpayne@68: map.put(a, b); jpayne@68: } jpayne@68: return map; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Simplify the tree by assigning ranks to unranked nodes, jpayne@68: * where possible, through inference. jpayne@68: * Optionally removes unranked nodes based on the skipNorank field. jpayne@68: * @param nodes Array of all TaxNodes. jpayne@68: * @return Number of nodes removed. jpayne@68: */ jpayne@68: private int simplify(TaxNode nodes[]){ jpayne@68: jpayne@68: int failed=test(nodes); jpayne@68: jpayne@68: int removed=0; jpayne@68: int reassigned=0; jpayne@68: jpayne@68: if(reassign){ jpayne@68: boolean changed=true; jpayne@68: int changedCount=0; jpayne@68: while(changed){ jpayne@68: changed=false; jpayne@68: for(int i=0; i=SUBSPECIES_E){ jpayne@68: // if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ jpayne@68: changed=true; jpayne@68: n.levelExtended=SUBSPECIES_E; jpayne@68: n.level=SUBSPECIES; jpayne@68: changedCount++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: System.err.println("Assigned levels to "+changedCount+" unranked nodes."); jpayne@68: } jpayne@68: jpayne@68: jpayne@68: if(skipNorank){//Skip nodes with unknown taxa jpayne@68: if(verbose){outstream.println("A0");} jpayne@68: jpayne@68: for(int i=0; iparent.levelExtended){ jpayne@68: //System.err.println("Reassigned from "+parent); jpayne@68: assert(parent.id!=parent.pid); jpayne@68: parent=nodes[parent.pid]; jpayne@68: 
n.pid=parent.id; jpayne@68: reassigned++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: for(int i=0; i0){//Infer level for unset nodes (from "no rank") jpayne@68: if(verbose){outstream.println("A");} jpayne@68: int changed=1; jpayne@68: while(changed>0){ jpayne@68: changed=0; jpayne@68: for(final TaxNode n : nodes){ jpayne@68: if(n!=null){ jpayne@68: if(n.levelExtended==0){ jpayne@68: TaxNode parent=nodes[n.pid]; jpayne@68: if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){ jpayne@68: n.levelExtended=Tools.max(1, parent.levelExtended-1); jpayne@68: assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit); jpayne@68: changed++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(verbose){outstream.println("changed: "+changed);} jpayne@68: } jpayne@68: jpayne@68: // outstream.println("B"); jpayne@68: // for(TaxNode n : nodes){ jpayne@68: // if(n!=null && n.level==0){ jpayne@68: // n.level=-1; jpayne@68: // } jpayne@68: // } jpayne@68: } jpayne@68: jpayne@68: failed=test(nodes); jpayne@68: jpayne@68: // if(reassign){//Skip nodes with duplicate taxa jpayne@68: // if(verbose){outstream.println("D");} jpayne@68: // int changed=1; jpayne@68: // while(changed>0){ jpayne@68: // changed=0; jpayne@68: // for(final TaxNode n : nodes){ jpayne@68: // if(n!=null){ jpayne@68: // TaxNode parent=nodes[n.pid]; jpayne@68: // TaxNode grandparent=nodes[parent.pid]; jpayne@68: // assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent; jpayne@68: // assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent; jpayne@68: // jpayne@68: // while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) || jpayne@68: // n.level>parent.level || (n.level==parent.level))){ jpayne@68: // parent=grandparent; jpayne@68: // 
grandparent=nodes[parent.pid]; jpayne@68: // n.pid=parent.id; jpayne@68: // reassigned++; jpayne@68: // changed++; jpayne@68: // } jpayne@68: // } jpayne@68: // } jpayne@68: // if(verbose){outstream.println("changed: "+changed);} jpayne@68: // } jpayne@68: // if(verbose){outstream.println("E");} jpayne@68: // for(int i=0; i "+parent+" -> "+grandparent; jpayne@68: assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: // if(verbose){System.err.println("Reassignments: "+reassigned);} jpayne@68: jpayne@68: return removed; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Validation ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Ensure tree has monotonically increasing (or nondescending) ranks. jpayne@68: * @param nodes All TaxNodes. jpayne@68: * @return Number of violations. 
jpayne@68: */ jpayne@68: private static int test(TaxNode[] nodes){ jpayne@68: int failed=0; jpayne@68: for(final TaxNode n : nodes){ jpayne@68: if(n!=null){ jpayne@68: TaxNode parent=nodes[n.pid]; jpayne@68: try { jpayne@68: assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) : jpayne@68: "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n" jpayne@68: + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended; jpayne@68: assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent; jpayne@68: // assert(n==parent || n.level "+parent; jpayne@68: if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){ jpayne@68: if(verbose){outstream.println("Error: "+n+" -> "+parent);} jpayne@68: failed++; jpayne@68: }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){ jpayne@68: // if(verbose){outstream.println("Error: "+n+" -> "+parent);} jpayne@68: // failed++; jpayne@68: } jpayne@68: assert(n!=parent || n.id<=1) : n; jpayne@68: } catch (Throwable e) { jpayne@68: // TODO Auto-generated catch block jpayne@68: e.printStackTrace(); jpayne@68: failed++; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(verbose || failed>0){outstream.println(failed+" nodes failed.");} jpayne@68: return failed; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Print Methods ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Format full name in semicolon format, e.g. jpayne@68: * "SK:Bacteria;P:Protobacteria;..." jpayne@68: * @param tn0 Base node jpayne@68: * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. 
jpayne@68: * @return Resultant String jpayne@68: */ jpayne@68: public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){ jpayne@68: StringBuilder sb=new StringBuilder(); jpayne@68: if(tn0==null){return "Not found";} jpayne@68: String semi=""; jpayne@68: ArrayList list=toAncestors(tn0, skipNonCanonical); jpayne@68: boolean addTaxLevel=true; jpayne@68: for(int i=list.size()-1; i>=0; i--){ jpayne@68: sb.append(semi); jpayne@68: TaxNode tn=list.get(i); jpayne@68: if(tn.id!=LIFE_ID || list.size()==1){ jpayne@68: if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){ jpayne@68: sb.append(tn.levelToStringShort()).append(':'); jpayne@68: } jpayne@68: sb.append(mononomial ? mononomial(tn) : tn.name); jpayne@68: semi=";"; jpayne@68: } jpayne@68: } jpayne@68: return sb.toString(); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Return a list of TaxIDs of all ancestors. jpayne@68: * @param tn0 Base node jpayne@68: * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. jpayne@68: * @return List of TaxIDs. jpayne@68: */ jpayne@68: public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){ jpayne@68: if(tn0==null){return null;} jpayne@68: IntList list=new IntList(8); jpayne@68: jpayne@68: TaxNode tn=tn0; jpayne@68: while(tn!=null){ jpayne@68: if(!skipNonCanonical || tn.isSimple()){ jpayne@68: if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);} jpayne@68: } jpayne@68: if(tn.pid==tn.id){break;} jpayne@68: tn=getNode(tn.pid); jpayne@68: } jpayne@68: if(list.isEmpty()){list.add(tn0.id);} jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Return a list of all ancestors. jpayne@68: * @param tn0 Base node jpayne@68: * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. jpayne@68: * @return List of ancestor nodes. 
jpayne@68: */ jpayne@68: public ArrayList toAncestors(final TaxNode tn0, boolean skipNonCanonical){ jpayne@68: if(tn0==null){return null;} jpayne@68: ArrayList list=new ArrayList(8); jpayne@68: jpayne@68: TaxNode tn=tn0; jpayne@68: while(tn!=null){ jpayne@68: if(!skipNonCanonical || tn.isSimple()){ jpayne@68: if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);} jpayne@68: } jpayne@68: if(tn.pid==tn.id){break;} jpayne@68: tn=getNode(tn.pid); jpayne@68: } jpayne@68: if(list.isEmpty()){list.add(tn0);} jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Generate a path to the genome of an organism on the filesystem; jpayne@68: * used by ExplodeTree. Intended for internal JGI use. jpayne@68: * @param root Location of the exploded tree. jpayne@68: * @return Path to a genome. jpayne@68: */ jpayne@68: public String toDir(TaxNode node, String root){ jpayne@68: StringBuilder sb=new StringBuilder(); jpayne@68: if(root==null){root="";} jpayne@68: sb.append(root); jpayne@68: if(root.length()>0 && !root.endsWith("/")){sb.append('/');} jpayne@68: IntList list=toAncestorIds(node, false); jpayne@68: list.reverse(); jpayne@68: assert(list.get(0)==1) : list + "," +getNode(list.get(0)); jpayne@68: for(int i=0; i'){header=header.substring(1);} jpayne@68: if(!header.startsWith("tid|")){return -1;} jpayne@68: int idx=3; jpayne@68: int idx2=header.indexOf('|', 4); jpayne@68: if(idx2<5){return -1;} jpayne@68: int id=-1; jpayne@68: try { jpayne@68: id=Parse.parseInt(header, idx+1, idx2); jpayne@68: // System.err.println("d"+", "+header.substring(idx+1, idx2)); jpayne@68: } catch (Throwable e) { jpayne@68: // System.err.println("e"+", "+header.substring(idx+1, idx2)); jpayne@68: //ignore jpayne@68: } jpayne@68: return id; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Determine the TaxID of a String. jpayne@68: * @param header Typically a sequence header jpayne@68: * @param bestEffort In some cases, try certain substrings if the name is not found. 
jpayne@68: * @return jpayne@68: */ jpayne@68: public TaxNode parseNodeFromHeader(String header, boolean bestEffort){ jpayne@68: if(header==null || header.length()<2){return null;} jpayne@68: if(header.charAt(0)=='>'){header=header.substring(1);} jpayne@68: TaxNode tn; jpayne@68: if(SILVA_MODE){ jpayne@68: tn=getNodeSilva(header, bestEffort); jpayne@68: }else if(UNITE_MODE){ jpayne@68: tn=getNodeUnite(header, bestEffort); jpayne@68: }else{ jpayne@68: final char delimiter=ncbiHeaderDelimiter(header); jpayne@68: if(delimiter==' '){ jpayne@68: tn=getNodeNewStyle(header); jpayne@68: }else{ jpayne@68: tn=getNodeOldStyle(header, delimiter); jpayne@68: if(tn==null && delimiter=='|'){ jpayne@68: // System.err.println("A: "+header); jpayne@68: int id=-1; jpayne@68: String[] split=header.split("\\|"); jpayne@68: if(AccessionToTaxid.LOADED()){ jpayne@68: for(int i=0; i=0){tn=getNode(id);} jpayne@68: // System.err.println("F: "+tn); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return tn; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Guess the delimiter character in a String; jpayne@68: * typically assumed to be '|', '~', or ' '. jpayne@68: */ jpayne@68: public static char ncbiHeaderDelimiter(String header){ jpayne@68: for(int i=0; i0) : "i="+i+"; malformatted header '"+header+"'"; jpayne@68: return c; jpayne@68: }else if(Character.isWhitespace(c)){ jpayne@68: return ' '; jpayne@68: } jpayne@68: } jpayne@68: return ' '; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Parse a Silva header to a Node. jpayne@68: * @param s Silva header. jpayne@68: * @param bestEffort Try certain substrings if the name is not found. 
jpayne@68: * @return Node jpayne@68: */ jpayne@68: TaxNode getNodeSilva(String s, boolean bestEffort){ jpayne@68: if(s==null){return null;} jpayne@68: if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ jpayne@68: return getNodeOldStyle(s, s.charAt(3)); jpayne@68: } jpayne@68: String[] split=Tools.semiPattern.split(s); jpayne@68: jpayne@68: int number=-1; jpayne@68: // final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast")); jpayne@68: // if(chloroplast){return null;} jpayne@68: for(int i=split.length-1; number<0 && i>=0; i--){ jpayne@68: String last=split[i]; jpayne@68: int paren=last.indexOf('('); jpayne@68: if(paren>=0){last=last.substring(0, paren);} jpayne@68: last=last.trim(); jpayne@68: jpayne@68: if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){ jpayne@68: number=parseNameToTaxid(last); jpayne@68: } jpayne@68: jpayne@68: if(number>=0){return getNode(number);} jpayne@68: else if(!bestEffort){break;} jpayne@68: } jpayne@68: return null; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Parse a Unite header to a Node. jpayne@68: * @param s Unite header. jpayne@68: * @param bestEffort Try certain substrings if the name is not found. 
jpayne@68: * @return Node jpayne@68: */ jpayne@68: TaxNode getNodeUnite(String s, boolean bestEffort){ jpayne@68: if(s==null){return null;} jpayne@68: if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ jpayne@68: return getNodeOldStyle(s, s.charAt(3)); jpayne@68: } jpayne@68: String[] split=Tools.pipePattern.split(s); jpayne@68: jpayne@68: int number=-1; jpayne@68: String name=split[0]; jpayne@68: String acc=split[1]; jpayne@68: if(AccessionToTaxid.LOADED() && acc.length()>0){ jpayne@68: number=AccessionToTaxid.get(acc); jpayne@68: } jpayne@68: if(number<1){ jpayne@68: TaxNode tn=getNodeByName(name); jpayne@68: if(tn!=null){number=tn.id;} jpayne@68: } jpayne@68: jpayne@68: if(number>=0){return getNode(number);} jpayne@68: return null; jpayne@68: } jpayne@68: jpayne@68: /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */ jpayne@68: private TaxNode getNodeOldStyle(final String s, char delimiter){ jpayne@68: { jpayne@68: int index=s.indexOf(delimiter); jpayne@68: if(index<0){ jpayne@68: delimiter='~'; jpayne@68: index=s.indexOf(delimiter); jpayne@68: if(index<0){ jpayne@68: delimiter='_'; jpayne@68: index=s.indexOf(delimiter); jpayne@68: } jpayne@68: } jpayne@68: int number=-1; jpayne@68: jpayne@68: Throwable e=null; jpayne@68: jpayne@68: if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){ jpayne@68: // System.err.println("Parsing gi number."); jpayne@68: jpayne@68: if(GiToTaxid.isInitialized()){ jpayne@68: try { jpayne@68: number=GiToTaxid.parseGiToTaxid(s, delimiter); jpayne@68: } catch (Throwable e2) { jpayne@68: e=e2; jpayne@68: } jpayne@68: }else{ jpayne@68: assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s; jpayne@68: } jpayne@68: // if(number!=-1){System.err.println("number="+number);} jpayne@68: }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){ jpayne@68: // 
System.err.println("Parsing ncbi number."); jpayne@68: number=GiToTaxid.parseTaxidNumber(s, delimiter); jpayne@68: }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){ jpayne@68: // System.err.println("Parsing ncbi number."); jpayne@68: long img=parseDelimitedNumber(s, delimiter); jpayne@68: ImgRecord record=imgMap.get(img); jpayne@68: number=(record==null ? -1 : record.taxID); jpayne@68: }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete jpayne@68: // System.err.println("Parsing ncbi number."); jpayne@68: number=GiToTaxid.parseTaxidNumber(s, delimiter); jpayne@68: } jpayne@68: jpayne@68: if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){ jpayne@68: String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s)); jpayne@68: if(AccessionToTaxid.LOADED()){ jpayne@68: number=parseAccessionToTaxid(split); jpayne@68: } jpayne@68: if(number<0){ jpayne@68: number=parseHeaderNameToTaxid(split); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(number<0 && e!=null){ jpayne@68: assert(false) : e; jpayne@68: throw new RuntimeException(e); jpayne@68: } jpayne@68: jpayne@68: //TaxServer code could go here... jpayne@68: jpayne@68: if(number>=0){return getNode(number);} jpayne@68: } jpayne@68: if(verbose){System.err.println("Can't process name "+s);} jpayne@68: if(Tools.isDigit(s.charAt(0)) && s.length()<=9){ jpayne@68: try { jpayne@68: return getNode(Integer.parseInt(s)); jpayne@68: } catch (NumberFormatException e) { jpayne@68: //ignore jpayne@68: } jpayne@68: } jpayne@68: return null; jpayne@68: } jpayne@68: jpayne@68: /** Parse a delimited number from a header, or return -1 if formatted incorrectly. 
*/ jpayne@68: static long parseDelimitedNumber(String s, char delimiter){ jpayne@68: if(s==null){return -1;} jpayne@68: int i=0; jpayne@68: while(i=s.length() || !Tools.isDigit(s.charAt(i))){return -1;} jpayne@68: jpayne@68: long number=0; jpayne@68: while(i0){ jpayne@68: number=AccessionToTaxid.get(s.substring(0, space)); jpayne@68: }else{ jpayne@68: number=AccessionToTaxid.get(s); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){ jpayne@68: try { jpayne@68: return getNode(Integer.parseInt(s)); jpayne@68: } catch (NumberFormatException e) { jpayne@68: //ignore jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(number<0 && space>0){ jpayne@68: number=parseNameToTaxid(s.substring(space+1)); jpayne@68: } jpayne@68: jpayne@68: if(number>-1){return getNode(number);} jpayne@68: if(space<0 && s.indexOf('_')>0){ jpayne@68: return getNodeNewStyle(s.replace('_', ' ')); jpayne@68: } jpayne@68: return null; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * For parsing old-style NCBI headers. jpayne@68: */ jpayne@68: public int parseAccessionToTaxid(String[] split){ jpayne@68: if(split.length<4){ jpayne@68: return -1; jpayne@68: } jpayne@68: int ncbi=AccessionToTaxid.get(split[3]); jpayne@68: return ncbi; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * For parsing old-style NCBI headers. jpayne@68: */ jpayne@68: public int parseHeaderNameToTaxid(String[] split){ jpayne@68: if(split.length<5){ jpayne@68: return -1; jpayne@68: } jpayne@68: return parseNameToTaxid(split[4]); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens"). jpayne@68: * If multiple nodes share the same name, returns the first; to get the full list, jpayne@68: * use getNodesByNameExtended. jpayne@68: * @param name Organism name. jpayne@68: * @return Organism TaxID, or -1 if not found. 
jpayne@68: */ jpayne@68: public int parseNameToTaxid(String name){ jpayne@68: // assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size()); jpayne@68: List list=null; jpayne@68: jpayne@68: list=getNodesByNameExtended(name); jpayne@68: jpayne@68: if(list==null || list.size()>1){return -1;} jpayne@68: return list.get(0).id; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch nodes indicated by this name. jpayne@68: * @param name A taxonomic name delimited by space or underscore. jpayne@68: * @return Nodes corresponding to the name. jpayne@68: */ jpayne@68: public List getNodesByNameExtended(String name){ jpayne@68: List list=null; jpayne@68: jpayne@68: list=getNodesByName(name); jpayne@68: if(list!=null){return list;} jpayne@68: jpayne@68: name=name.replaceAll("_", " ").trim(); jpayne@68: list=getNodesByName(name); jpayne@68: if(list!=null){return list;} jpayne@68: jpayne@68: String[] split2=name.split(" "); jpayne@68: jpayne@68: if(split2.length>7){ jpayne@68: String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: jpayne@68: if(split2.length>6){ jpayne@68: String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: jpayne@68: if(split2.length>5){ jpayne@68: String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> 
"+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: if(split2.length>4){ jpayne@68: String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: if(split2.length>3){ jpayne@68: String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: if(split2.length>2){ jpayne@68: String term=split2[0]+" "+split2[1]+" "+split2[2]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: if(split2.length>1){ jpayne@68: String term=split2[0]+" "+split2[1]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: if(split2.length>0){ jpayne@68: String term=split2[0]; jpayne@68: list=getNodesByName(term); jpayne@68: // System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); jpayne@68: if(list!=null){return list;} jpayne@68: } jpayne@68: jpayne@68: return null; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Assorted Methods ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Return the TaxID of the lowest ancestor node at least the specified level, jpayne@68: * including this node 
itself. Level is the normal (non-extended) level. jpayne@68: * @param taxID jpayne@68: * @param taxLevel jpayne@68: * @return jpayne@68: */ jpayne@68: public int promote(final int taxID, int taxLevel){ jpayne@68: TaxNode tn=null; jpayne@68: tn=(taxID<1 ? null : getNode(taxID)); jpayne@68: tn=promote(tn, taxLevel); jpayne@68: return (tn==null ? taxID : tn.id); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch the first node in this node's lineage of at least the indicated level. jpayne@68: * This can be the node itself or an ancestor. jpayne@68: * @see getNodeAtLevelExtended jpayne@68: * @param tn Node in question jpayne@68: * @param taxLevel Desired minimum level jpayne@68: * @return A node at the desired level jpayne@68: */ jpayne@68: public TaxNode promote(TaxNode tn, int taxLevel){ jpayne@68: while(tn!=null && tn.pid!=tn.id && tn.level=TaxTree.LIFE || temp.level>taxLevel){break;} jpayne@68: tn=temp; jpayne@68: } jpayne@68: return tn; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Determine the TaxID of the node's parent. jpayne@68: * @param id TaxID of child node jpayne@68: * @return Parent TaxID jpayne@68: */ jpayne@68: public int getParentID(int id){ jpayne@68: assert(id=nodes.length){return -1;} jpayne@68: TaxNode tn=nodes[id]; jpayne@68: if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);} jpayne@68: return tn==null ? -1 : tn.pid; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch the node with this TaxID. jpayne@68: * @param id TaxID jpayne@68: * @return Node jpayne@68: */ jpayne@68: public TaxNode getNode(int id){ jpayne@68: assert(id=nodes.length){return null;} jpayne@68: TaxNode tn=nodes[id]; jpayne@68: if(tn!=null || mergedMap==null){return tn;} jpayne@68: return getNode(mergedMap.get(id), true); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch the node with this TaxID, but don't throw assertions upon failure. 
jpayne@68: * @param id TaxID jpayne@68: * @return Node jpayne@68: */ jpayne@68: public TaxNode getNode(int id, boolean skipAssertion){ jpayne@68: assert(skipAssertion || id=nodes.length){return null;} jpayne@68: TaxNode tn=nodes[id]; jpayne@68: if(tn!=null || mergedMap==null){return tn;} jpayne@68: return getNode(mergedMap.get(id), true); jpayne@68: } jpayne@68: jpayne@68: public TaxNode getNodeAtLevel(int id, int minLevel){ jpayne@68: return getNodeAtLevel(id, minLevel, DOMAIN); jpayne@68: } jpayne@68: jpayne@68: public TaxNode getNodeAtLevelExtended(int id, int minLevelE){ jpayne@68: return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E); jpayne@68: } jpayne@68: jpayne@68: public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){ jpayne@68: final int minLevelExtended=levelToExtended(minLevel); jpayne@68: final int maxLevelExtended=levelToExtended(maxLevel); jpayne@68: return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended); jpayne@68: } jpayne@68: jpayne@68: public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){ jpayne@68: TaxNode tn=getNode(id); jpayne@68: while(tn!=null && tn.pid!=tn.id && tn.levelExtendedmaxLevelE){break;} jpayne@68: tn=temp; jpayne@68: } jpayne@68: return tn; jpayne@68: } jpayne@68: jpayne@68: public int getIdAtLevelExtended(int taxID, int taxLevelExtended){ jpayne@68: if(taxLevelExtended<0){return taxID;} jpayne@68: TaxNode tn=getNode(taxID); jpayne@68: while(tn!=null && tn.id!=tn.pid && tn.levelExtendedtaxLevelExtended){break;} jpayne@68: taxID=tn.id; jpayne@68: } jpayne@68: return taxID; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch the node with this name. jpayne@68: * Throw an assertion if there are multiple such nodes. jpayne@68: * @param s Organism name. jpayne@68: * @return Node with given name. 
jpayne@68: */ jpayne@68: public TaxNode getNodeByName(String s){ jpayne@68: List list=getNodesByName(s, false); jpayne@68: if(list==null){list=getNodesByName(s, true);} jpayne@68: if(list==null || list.isEmpty()){return null;} jpayne@68: if(list.size()==1){return list.get(0);} jpayne@68: assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n"; jpayne@68: TaxNode a=list.get(0); jpayne@68: for(int i=1; i getNodesByName(String s){ jpayne@68: List list=getNodesByName(s, false); jpayne@68: if(list==null){list=getNodesByName(s, true);} jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Fetch a map of names to nodes. If absent, create it first. jpayne@68: * @param lowercase If true, return the map with lowercase keys. jpayne@68: * @return Map of names to nodes. jpayne@68: */ jpayne@68: private HashMap> getMap(boolean lowercase){ jpayne@68: HashMap> map=(lowercase ? nameMapLower : nameMap); jpayne@68: if(map==null){ jpayne@68: synchronized(this){hashNames(true);} jpayne@68: map=(lowercase ? nameMapLower : nameMap); jpayne@68: } jpayne@68: assert(map!=null) : "Tax names were not hashed."; jpayne@68: return map; jpayne@68: } jpayne@68: jpayne@68: private List getNodesByName(String s, boolean lowercase){ jpayne@68: if(s==null){return null;} jpayne@68: if(s.indexOf('_')>=0){s=s.replace('_', ' ');} jpayne@68: if(lowercase){s=s.toLowerCase();} jpayne@68: // System.err.println("Searching for "+s); jpayne@68: final HashMap> map=getMap(lowercase); jpayne@68: ArrayList list=map.get(s); jpayne@68: if(list!=null){return list;} jpayne@68: // System.err.println("No matches for '"+s+"'"); jpayne@68: jpayne@68: // assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s); jpayne@68: jpayne@68: if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;} jpayne@68: String[] split=delimiter2.split(lowercase ? 
s.toLowerCase() : s, 8); jpayne@68: // System.err.println("Array: "+Arrays.toString(split)); jpayne@68: list=map.get(split[split.length-1]); jpayne@68: if(list==null){return list;} jpayne@68: // System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")"); jpayne@68: jpayne@68: int matchCount=0; jpayne@68: for(TaxNode tn : list){ jpayne@68: if(tn.matchesName(split, split.length-1, this)){matchCount++;} jpayne@68: } jpayne@68: if(matchCount==list.size()){return list;} jpayne@68: if(matchCount<1){return null;} jpayne@68: ArrayList hits=new ArrayList(matchCount); jpayne@68: for(TaxNode tn : list){ jpayne@68: if(tn.matchesName(split, split.length-1, this)){hits.add(tn);} jpayne@68: } jpayne@68: return hits; jpayne@68: } jpayne@68: public ArrayList getAncestors(int id){ jpayne@68: TaxNode current=getNode(id); jpayne@68: ArrayList list=new ArrayList(); jpayne@68: while(current!=null && current.pid!=current.id){//ignores root jpayne@68: list.add(current); jpayne@68: current=getNode(current.pid); jpayne@68: } jpayne@68: //optionally add root here jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: public void increment(IntList ids, IntList counts, boolean sync){ jpayne@68: jpayne@68: ids.sort(); jpayne@68: ids.getUniqueCounts(counts); jpayne@68: jpayne@68: if(!sync){ jpayne@68: for(int i=0; i=0 && id gatherNodesAtLeastLimit(final long limit){ jpayne@68: return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1); jpayne@68: } jpayne@68: jpayne@68: public ArrayList gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){ jpayne@68: final int minLevelExtended=levelToExtended(minLevel); jpayne@68: final int maxLevelExtended=levelToExtended(maxLevel); jpayne@68: // assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended; jpayne@68: ArrayList list=new ArrayList(); jpayne@68: for(int i=minLevelExtended; i gatherNodesAtLeastLimitExtended(final int 
fromLevelExtended, final long limit){ jpayne@68: ArrayList list=new ArrayList(); jpayne@68: final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended]; jpayne@68: for(final TaxNode n : stratum){ jpayne@68: if(n.countSum>=limit){ jpayne@68: list.add(n); jpayne@68: TaxNode parent=nodes[n.pid]; jpayne@68: if(n!=parent){ jpayne@68: percolateUp(parent, -n.countSum);//123 This was negative for some reason jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: Shared.sort(list, TaxNode.countComparator); jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Static Initializers ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** jpayne@68: * Generate the name to level number map. jpayne@68: */ jpayne@68: private static HashMap makeLevelMap() { jpayne@68: HashMap map=new HashMap(31); jpayne@68: for(int i=0; i makeLevelMapExtended() { jpayne@68: HashMap map=new HashMap(129); jpayne@68: for(int i=0; i makeAltLevelMap() { jpayne@68: HashMap map=new HashMap(129); jpayne@68: for(int i=0; i0 && line[0]!='#'){ jpayne@68: int a=0, b=0; jpayne@68: jpayne@68: while(ba) : "Missing field 0: "+new String(line); jpayne@68: int tid=Parse.parseInt(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 1: "+new String(line); jpayne@68: long size=Parse.parseLong(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 2: "+new String(line); jpayne@68: long csize=Parse.parseLong(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 3: "+new String(line); jpayne@68: int seqs=Parse.parseInt(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 4: "+new String(line); jpayne@68: long cseqs=Parse.parseLong(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: 
while(ba) : "Missing field 5: "+new String(line); jpayne@68: int cnodes=Parse.parseInt(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);} jpayne@68: if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);} jpayne@68: if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);} jpayne@68: if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);} jpayne@68: if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);} jpayne@68: } jpayne@68: } jpayne@68: bf.close(); jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- IMG ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static int imgToTaxid(long img){ jpayne@68: ImgRecord ir=imgMap.get(img); jpayne@68: // assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n"; jpayne@68: return ir==null ? -1 : ir.taxID; jpayne@68: } jpayne@68: jpayne@68: public TaxNode imgToTaxNode(long img){ jpayne@68: int tid=imgToTaxid(img); jpayne@68: return tid<1 ? 
null : getNode(tid); jpayne@68: } jpayne@68: jpayne@68: // public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){ jpayne@68: // assert(imgMap==null); jpayne@68: // if(fname==null){return 0;} jpayne@68: // ImgRecord2.storeName=storeName; jpayne@68: // if(outstream!=null){System.err.println("Loading IMG.");} jpayne@68: // Timer t=new Timer(outstream, false); jpayne@68: // ImgRecord2[] array=ImgRecord2.toArray(fname); jpayne@68: // int x=loadIMG(array); jpayne@68: // t.stopAndPrint(); jpayne@68: // return x; jpayne@68: // } jpayne@68: jpayne@68: public static int loadIMG(String fname, boolean storeName, PrintStream outstream){ jpayne@68: assert(imgMap==null); jpayne@68: if(fname==null){return 0;} jpayne@68: ImgRecord.storeName=storeName; jpayne@68: if(outstream!=null){System.err.println("Loading IMG.");} jpayne@68: Timer t=new Timer(outstream, false); jpayne@68: ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ); jpayne@68: int x=loadIMG(array); jpayne@68: t.stopAndPrint(); jpayne@68: return x; jpayne@68: } jpayne@68: jpayne@68: public static int loadIMG(ImgRecord[] array){ jpayne@68: assert(imgMap==null); jpayne@68: imgMap=new HashMap((int)(array.length*1.5)); jpayne@68: for(ImgRecord record : array){ jpayne@68: imgMap.put(record.imgID, record); jpayne@68: } jpayne@68: return imgMap.size(); jpayne@68: } jpayne@68: jpayne@68: @Deprecated jpayne@68: public static int parseLevel(String b){ jpayne@68: final int level; jpayne@68: if(b==null){level=-1;} jpayne@68: else if(Tools.isNumeric(b.charAt(0))){ jpayne@68: level=Integer.parseInt(b); jpayne@68: }else{ jpayne@68: level=stringToLevel(b.toLowerCase()); jpayne@68: } jpayne@68: return level; jpayne@68: } jpayne@68: jpayne@68: public static int parseLevelExtended(String b){ jpayne@68: final int level; jpayne@68: if(b==null){level=-1;} jpayne@68: else if(Tools.isNumeric(b.charAt(0))){ jpayne@68: level=levelToExtended(Integer.parseInt(b)); jpayne@68: }else{ jpayne@68: 
level=stringToLevelExtended(b.toLowerCase()); jpayne@68: } jpayne@68: return level; jpayne@68: } jpayne@68: jpayne@68: public boolean isUnclassified(int tid){ jpayne@68: TaxNode tn=getNode(tid); jpayne@68: while(tn!=null && tn.id!=tn.pid){ jpayne@68: if(tn.isUnclassified()){return true;} jpayne@68: if(tn.pid==tn.id){break;} jpayne@68: tn=getNode(tn.pid); jpayne@68: } jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: public boolean isEnvironmentalSample(int tid){ jpayne@68: TaxNode tn=getNode(tid); jpayne@68: while(tn!=null && tn.id!=tn.pid){ jpayne@68: if(tn.isEnvironmentalSample()){return true;} jpayne@68: if(tn.pid==tn.id){break;} jpayne@68: tn=getNode(tn.pid); jpayne@68: } jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: public boolean isVirus(int tid){ jpayne@68: TaxNode tn=getNode(tid); jpayne@68: while(tn!=null && tn.id!=tn.pid){ jpayne@68: if(tn.id==VIRUSES_ID){return true;} jpayne@68: if(tn.pid==tn.id){break;} jpayne@68: tn=getNode(tn.pid); jpayne@68: } jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: public long definedLevels(int tid){ jpayne@68: long levels=0; jpayne@68: TaxNode tn=getNode(tid); jpayne@68: while(tn!=null && tn.id!=tn.pid){ jpayne@68: levels=levels|(1L< "Sapiens" jpayne@68: * @param tid TaxID jpayne@68: * @return Correct name for this node. 
jpayne@68: */ jpayne@68: public String mononomial(int tid){return mononomial(getNode(tid));} jpayne@68: public String mononomial(TaxNode tn){ jpayne@68: if(tn==null){return null;} jpayne@68: String name=tn.name; jpayne@68: if(name.indexOf(' ')<0){return name;} jpayne@68: TaxNode parent=getNode(tn.pid); jpayne@68: if(parent==null){return name;} jpayne@68: String pname=parent.name; jpayne@68: if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){ jpayne@68: name=name.substring(pname.length()+1); jpayne@68: } jpayne@68: return name; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** All nodes in the tree in a flat array, indexed by TaxiD */ jpayne@68: public final TaxNode[] nodes; jpayne@68: jpayne@68: /** Number of nodes per normal level */ jpayne@68: public final int[] nodesPerLevel=new int[taxLevelNames.length]; jpayne@68: jpayne@68: /** Number of nodes per extended level */ jpayne@68: public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length]; jpayne@68: jpayne@68: /** Number of nodes in the tree */ jpayne@68: public final int nodeCount; jpayne@68: jpayne@68: /** Maps old TaxIDs to new TaxIDs */ jpayne@68: public final IntHashMap mergedMap; jpayne@68: jpayne@68: /** Arrays of all nodes at a given taxonomic level (extended) */ jpayne@68: public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][]; jpayne@68: jpayne@68: /** Map of names to nodes */ jpayne@68: HashMap> nameMap; jpayne@68: /** Map of lowercase names to nodes */ jpayne@68: HashMap> nameMapLower; jpayne@68: /** Map of nodes to child nodes */ jpayne@68: HashMap> childMap; jpayne@68: public HashMap> nameMap(){return nameMap;} jpayne@68: jpayne@68: @Deprecated jpayne@68: public int minValidTaxa=0; //TODO: Remove 
(will break serialization) jpayne@68: jpayne@68: /** Infer ranks for no-rank nodes, when possible */ jpayne@68: public boolean simplify=true; jpayne@68: /** See simplify() for details, works in conjunction with simplify */ jpayne@68: public boolean reassign=true; jpayne@68: /** Discard no-rank nodes */ jpayne@68: public boolean skipNorank=false; jpayne@68: public int inferRankLimit=0;//levelMap.get("species"); jpayne@68: jpayne@68: //Node Statistics jpayne@68: /** Number of bases assigned to this TaxID in RefSeq */ jpayne@68: private IntLongHashMap refseqSizeMap; jpayne@68: /** Number of bases assigned to this TaxID and descendants in RefSeq */ jpayne@68: private IntLongHashMap refseqSizeMapC; jpayne@68: /** Number of sequences assigned to this TaxID in RefSeq */ jpayne@68: private IntHashMap refseqSeqMap; jpayne@68: /** Number of sequences assigned to this TaxID and descendants in RefSeq */ jpayne@68: private IntLongHashMap refseqSeqMapC; jpayne@68: /** Number of descendant nodes, inclusive, for each TaxID */ jpayne@68: private IntHashMap nodeMapC; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Statics ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Assign levels to unranked nodes below species level, when possible */ jpayne@68: public static boolean assignStrains=true; jpayne@68: /** Assume headers are in Silva format */ jpayne@68: public static boolean SILVA_MODE=false; jpayne@68: /** Assume headers are in Unite format */ jpayne@68: public static boolean UNITE_MODE=false; jpayne@68: /** Probably unnecessary at this point... 
present for legacy reasons */
	public static boolean CRASH_IF_NO_GI_TABLE=true;

	/** Enables extra diagnostic output (e.g. sample nodes per level in main). */
	public static boolean verbose=false;
	public static boolean SHOW_WARNINGS=false;

	/** Maps IMG IDs to records from the dump file */
	// NOTE(review): the type arguments of this map are not recoverable from this listing; restore them from the canonical source.
	private static HashMap imgMap;

	/** Set to false if the tree is expected to be mutated.
	 * @TODO Remove mutable fields from the tree (like counters).
	 */
	public static boolean ALLOW_SHARED_TREE=true;

	/**
	 * Universal location of the shared TaxTree used by various classes.
	 * Volatile because sharedTree(...) reads it outside of any lock before
	 * deciding whether to load; without volatile another thread could observe
	 * a reference to a tree whose fields are not yet visible.
	 */
	private static volatile TaxTree sharedTree;

	/** A simpler and probably less safe version of sharedTree(...) */
	public static TaxTree getTree(){return sharedTree;}

	/**
	 * Fetch the shared tree, loading it from file if not present.
	 * @param fname Path of the serialized tree, or "auto" for defaultTreeFile(); if null, nothing is loaded.
	 * @param hashNames If true, make sure the tree's name map has been built before returning.
	 * @param hashDotFormat Passed through to hashNames(...).
	 * @param outstream Destination for the "Loading"/"Hashing" messages; those messages are skipped when null.
	 * @return The shared tree, or null if sharing is disabled or no tree has been loaded.
	 */
	private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) {
		if(!ALLOW_SHARED_TREE){return null;}
		if(sharedTree==null && fname!=null){
			if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();}
			synchronized(TaxTree.class){
				if(sharedTree==null){//Re-check under the lock so only one thread loads the tree
					if(outstream!=null){outstream.println("Loading tax tree.");}
					Timer t=new Timer(outstream, false);
					setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat);
					t.stopAndPrint();
				}
			}
		}

		//Work on a local snapshot; previously a null tree (fname==null, nothing loaded)
		//combined with hashNames=true caused a NullPointerException here.
		final TaxTree tree=sharedTree;
		if(tree==null){return null;}

		if(hashNames && tree.nameMap==null){
			synchronized(tree){
				if(tree.nameMap==null){
					if(outstream!=null){outstream.println("Hashing names.");}
					Timer t=new Timer(outstream, false);
					tree.hashNames(hashDotFormat);
					t.stopAndPrint();
				}
			}
		}
		return tree;
	}

	/**
	 * For initialization.  Normally only one tree is needed by a process so it is set here.
	 * Asserts that sharing is allowed and that no shared tree has been set yet.
	 * @param tree The tree to publish as the shared tree.
	 * @param hashNames If true, build the tree's name map if it is absent.
	 * @param hashDotFormat Passed through to hashNames(...).
	 */
	private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){
		assert(ALLOW_SHARED_TREE);
		assert(sharedTree==null);
		//Hash before publishing so that readers of the volatile field see a fully prepared tree.
		//The null guard avoids a NullPointerException if the tree could not be read.
		if(hashNames && tree!=null && tree.nameMap==null){
			synchronized(tree){
				if(tree.nameMap==null){
					tree.hashNames(hashDotFormat);
				}
			}
		}
		sharedTree=tree;
	}

	/**
	 * Determine whether a taxonomic level is standard. e.g.:<br>
	 * isSimple("phylum")=true<br>
	 * isSimple("subphylum")=false<br>
	 * isSimple("no-rank")=false
	 * @param levelExtended The extended level to test.
	 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
	 */
	public static boolean isSimple(int levelExtended){
		int level=extendedToLevel(levelExtended);
		return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level));
	}

	/**
	 * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br>
	 * isSimple2("phylum")=true<br>
	 * isSimple2("substrain")=true<br>
	 * isSimple2("subphylum")=false<br>
	 * isSimple2("no-rank")=false
	 * @param levelExtended The extended level to test.
	 * @return True if this level is not no-rank, and either the names of the normal and
	 * extended levels match, or the level is strain, subspecies, or substrain.
	 */
	public static boolean isSimple2(int levelExtended){
		int level=extendedToLevel(levelExtended);
		return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)
				|| levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E);
	}

	/*--------------------------------------------------------------*/
	/*----------------          Constants           ----------------*/
	/*--------------------------------------------------------------*/

	/** Get the number for the normal level of this name.
	 * The lookup result is unboxed, so a name absent from the map throws NullPointerException. */
	public static final int stringToLevel(String s){return altLevelMap.get(s);}
	/** True if this name is a known extended level name */
	public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);}
	/** Get the number for the extended level of this name.
	 * The lookup result is unboxed, so a name absent from the map throws NullPointerException. */
	public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);}
	/** Get the normal name for this normal level */
	public static final String levelToString(int x){return taxLevelNames[x];}
	/** Get the extended name for this extended level */
	public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];}
	/** Get the abbreviated name for this normal level */
	public static final String levelToStringShort(int x){return taxLevelNamesShort[x];}

	/** Normal, aka canonical, aka simple tax level names */
	private static final String[] taxLevelNames=new String[] {
		"no rank", "subspecies", "species", "genus",
		"family", "order", "class", "phylum",
		"kingdom", "superkingdom", "domain", "life"
	};
	public static final int numTaxLevelNames=taxLevelNames.length;

	/**
	 * Definitive representation of all NCBI taxonomic level names.
	 * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash.
	 * The first dimension maps normal ranks to extended ranks.
	 * Both dimensions are ordered ascending.
	 * @TODO Note!  If this goes over 63 names it will cause a problem with getDefinedLevels().
	 */
	//TODO See @TODO
	private static final String[][] taxLevelNamesExtendedMatrix=new String[][] {
		{"no rank"},
		{"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup",
			"biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas",
			"subspecies"},
		{"species"},
		{"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"},
		{"subtribe", "tribe", "subfamily", "family"},
		{"superfamily", "parvorder", "infraorder", "suborder", "order"},
		{"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"},
		{"superclass", "subdivision", "division", "subphylum", "phylum"},
		{"superphylum", "subkingdom", "kingdom"},
		{"superkingdom"},
		{"domain"},
		{"life"}
	};

	/** Extended tax level names as a 1D array */
	private static final String[] taxLevelNamesExtended=makeNamesExtended();
	/** Number of extended tax levels */
	public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length;

	/** Flatten the extended tax level names matrix to a 1D array, preserving row-then-column order */
	private static final String[] makeNamesExtended(){
		//Typed list: with a raw ArrayList, toArray(T[]) is erased to Object[] and the return would not compile.
		ArrayList<String> list=new ArrayList<String>();
		for(String[] row : taxLevelNamesExtendedMatrix){
			for(String name : row){
				list.add(name);
			}
		}
		return list.toArray(new String[0]);
	}

	/** Abbreviations of tax level names, mainly for semicolon form */
	private static final String[] taxLevelNamesShort=new String[] {
		"nr", "ss", "s", "g",
		"f", "o", "c", "p",
		"k", "sk", "d", "l"
	};

	/** Normal tax level numbers as constants; these are indices into taxLevelNames */
	public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3,
			FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11;

	/** TaxID of Life node */
	public static final int LIFE_ID=1;
	/** TaxID of Cellular Organisms node */
	public static final int CELLULAR_ORGANISMS_ID=131567;
	/** TaxID of Bacteria node */
	public static final int BACTERIA_ID=2; //Is this safe?  Who knows...
jpayne@68: /** TaxID of Archaea node */ jpayne@68: public static final int ARCHAEA_ID=2157; jpayne@68: /** TaxID of Euk node */ jpayne@68: public static final int EUKARYOTA_ID=2759; jpayne@68: /** TaxID of Animal node */ jpayne@68: public static final int METAZOA_ID=33208, ANIMALIA_ID=33208; jpayne@68: /** TaxID of Plant node */ jpayne@68: public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090; jpayne@68: /** TaxID of Fungi node */ jpayne@68: public static final int FUNGI_ID=4751; jpayne@68: /** TaxID of Virus node */ jpayne@68: public static final int VIRUSES_ID=10239; jpayne@68: /** TaxID of Viroids node (now defunct) */ jpayne@68: public static final int VIROIDS_ID=12884; jpayne@68: jpayne@68: /** Maps normal level names to normal level numbers */ jpayne@68: private static final HashMap levelMap=makeLevelMap(); jpayne@68: /** Maps extended level names to extended level numbers */ jpayne@68: private static final HashMap levelMapExtended=makeLevelMapExtended(); jpayne@68: /** Maps extended level names to normal level numbers */ jpayne@68: private static final HashMap altLevelMap=makeAltLevelMap(); jpayne@68: jpayne@68: /** Common extended level numbers as constants */ jpayne@68: public static final int NO_RANK_E=NO_RANK, jpayne@68: SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"), jpayne@68: SUBSPECIES_E=stringToLevelExtended("subspecies"), jpayne@68: SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"), jpayne@68: FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"), jpayne@68: CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"), jpayne@68: KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"), jpayne@68: DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life"); jpayne@68: jpayne@68: /** Map of normal to extended level numbers */ jpayne@68: private static final 
int[] levelToExtended=new int[] { jpayne@68: NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E, jpayne@68: ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E jpayne@68: }; jpayne@68: jpayne@68: /** Map of extended to normal level numbers */ jpayne@68: private static final int[] extendedToLevel=makeExtendedToLevel(); jpayne@68: jpayne@68: /** Creates extendedToLevel from taxaNamesExtendedMatrix during initialization. */ jpayne@68: private static int[] makeExtendedToLevel(){ jpayne@68: int len=0; jpayne@68: for(String[] array : taxLevelNamesExtendedMatrix){ jpayne@68: len+=array.length; jpayne@68: } jpayne@68: int[] ret=new int[len]; jpayne@68: jpayne@68: int pos=0; jpayne@68: for(int level=0; level phylum */ jpayne@68: public static final int extendedToLevel(int extended){ jpayne@68: return extended<0 ? -1 : extendedToLevel[extended]; jpayne@68: } jpayne@68: jpayne@68: /* Pre-compiled delimiters to save time when splitting lines */ jpayne@68: private static final Pattern delimiterTab = Pattern.compile("\t"); jpayne@68: private static final Pattern delimiter = Pattern.compile("\t\\|\t"); jpayne@68: private static final Pattern delimiterPipe = Pattern.compile("\\|"); jpayne@68: private static final Pattern delimiterTilde = Pattern.compile("\\~"); jpayne@68: private static final Pattern delimiter2 = Pattern.compile("[\\s_]+"); jpayne@68: jpayne@68: public static boolean IMG_HQ=false; jpayne@68: jpayne@68: /* For these fields, see the corresponding functions, below. jpayne@68: * They define the default paths to various data on NERSC. 
*/ jpayne@68: jpayne@68: private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest"; jpayne@68: private static final String defaultTaxPathAws="/test1/tax/latest"; jpayne@68: private static final String defaultTaxPathIGBVM="/data/tax/latest"; jpayne@68: private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; jpayne@68: private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; jpayne@68: private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; jpayne@68: private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz"; jpayne@68: private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz"; jpayne@68: private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz"; jpayne@68: jpayne@68: private static final String defaultImgFile="TAX_PATH/imgDump.txt"; jpayne@68: private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz"; jpayne@68: private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz"; jpayne@68: private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz"; jpayne@68: private static final String defaultPatternFile="TAX_PATH/patterns.txt"; jpayne@68: private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz"; jpayne@68: jpayne@68: private static final String defaultAccessionFile= jpayne@68: //"TAX_PATH/shrunk.protF.accession2taxid.gz," + jpayne@68: "TAX_PATH/shrunk.prot.accession2taxid.gz," jpayne@68: + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz," jpayne@68: + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz," jpayne@68: + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz," jpayne@68: // + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz," jpayne@68: + 
"TAX_PATH/shrunk.dead_wgs.accession2taxid.gz," jpayne@68: // + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz," jpayne@68: + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz," jpayne@68: + "TAX_PATH/shrunk.pdb.accession2taxid.gz"; jpayne@68: jpayne@68: /** For setting TAX_PATH, the root to taxonomy files */ jpayne@68: public static final String defaultTaxPath(){ jpayne@68: return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc; jpayne@68: } jpayne@68: jpayne@68: /** 16S consensus sequences per TaxID */ jpayne@68: public static final String default16SFile(){ jpayne@68: return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc; jpayne@68: } jpayne@68: jpayne@68: /** 18S consensus sequences per TaxID */ jpayne@68: public static final String default18SFile(){ jpayne@68: return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc; jpayne@68: } jpayne@68: jpayne@68: /** Path to all taxonomy files, substituted in to make specific file paths */ jpayne@68: public static String TAX_PATH=defaultTaxPath(); jpayne@68: jpayne@68: /** Location of gitable.int2d.gz for gi lookups */ jpayne@68: public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);} jpayne@68: /** Location of tree.taxtree.gz */ jpayne@68: public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);} jpayne@68: jpayne@68: //Use the prot.FULL.gz ncbi file. jpayne@68: public static boolean protFull=false; jpayne@68: jpayne@68: /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */ jpayne@68: public static final String defaultAccessionFile(){ jpayne@68: String s=(protFull ? 
"TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile; jpayne@68: return s.replaceAll("TAX_PATH", TAX_PATH); jpayne@68: } jpayne@68: /** Location of patterns.txt, which holds information about observed accession string formats */ jpayne@68: public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);} jpayne@68: /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */ jpayne@68: public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);} jpayne@68: /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */ jpayne@68: public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);} jpayne@68: jpayne@68: /** Screen output gets printed here */ jpayne@68: private static PrintStream outstream=System.out; jpayne@68: jpayne@68: }