jpayne@68
|
1 package tax;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.PrintStream;
|
jpayne@68
|
4 import java.io.Serializable;
|
jpayne@68
|
5 import java.util.ArrayList;
|
jpayne@68
|
6 import java.util.Arrays;
|
jpayne@68
|
7 import java.util.HashMap;
|
jpayne@68
|
8 import java.util.LinkedHashMap;
|
jpayne@68
|
9 import java.util.List;
|
jpayne@68
|
10 import java.util.regex.Pattern;
|
jpayne@68
|
11
|
jpayne@68
|
12 import fileIO.ByteFile;
|
jpayne@68
|
13 import fileIO.ReadWrite;
|
jpayne@68
|
14 import fileIO.TextFile;
|
jpayne@68
|
15 import shared.Parse;
|
jpayne@68
|
16 import shared.Parser;
|
jpayne@68
|
17 import shared.PreParser;
|
jpayne@68
|
18 import shared.Shared;
|
jpayne@68
|
19 import shared.Timer;
|
jpayne@68
|
20 import shared.Tools;
|
jpayne@68
|
21 import structures.ByteBuilder;
|
jpayne@68
|
22 import structures.IntHashMap;
|
jpayne@68
|
23 import structures.IntList;
|
jpayne@68
|
24 import structures.IntLongHashMap;
|
jpayne@68
|
25
|
jpayne@68
|
26 /**
|
jpayne@68
|
27 * Represents a taxonomic tree.
|
jpayne@68
|
28 * Usually just one of these needs to be created for a process.
|
jpayne@68
|
29 * Designed for NCBI's taxdmp.zip file contents.
|
jpayne@68
|
30 * @author Brian Bushnell
|
jpayne@68
|
31 * @date Mar 6, 2015
|
jpayne@68
|
32 *
|
jpayne@68
|
33 */
|
jpayne@68
|
34 public class TaxTree implements Serializable{
|
jpayne@68
|
35
|
jpayne@68
|
36 /**
|
jpayne@68
|
37 *
|
jpayne@68
|
38 */
|
jpayne@68
|
39 private static final long serialVersionUID = 5894416521711540017L;
|
jpayne@68
|
40
|
jpayne@68
|
41 /*--------------------------------------------------------------*/
|
jpayne@68
|
42 /*---------------- Main ----------------*/
|
jpayne@68
|
43 /*--------------------------------------------------------------*/
|
jpayne@68
|
44
|
jpayne@68
|
45 /**
|
jpayne@68
|
46 * Code entrance from the command line.
|
jpayne@68
|
47 * This is not called normally, only when converting NCBI text files
|
jpayne@68
|
48 * into a binary representation and writing it to disk.
|
jpayne@68
|
49 * @param args Command line arguments
|
jpayne@68
|
50 */
|
jpayne@68
|
51 public static void main(String[] args){
|
jpayne@68
|
52
|
jpayne@68
|
53 {//Preparse block for help, config files, and outstream
|
jpayne@68
|
54 PreParser pp=new PreParser(args, outstream, null, false);
|
jpayne@68
|
55 args=pp.args;
|
jpayne@68
|
56 outstream=pp.outstream;
|
jpayne@68
|
57 }
|
jpayne@68
|
58
|
jpayne@68
|
59 assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n";
|
jpayne@68
|
60 ReadWrite.USE_UNPIGZ=true;
|
jpayne@68
|
61 ReadWrite.USE_PIGZ=true;
|
jpayne@68
|
62 ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9);
|
jpayne@68
|
63 ReadWrite.PIGZ_BLOCKSIZE=256;
|
jpayne@68
|
64 ReadWrite.PIGZ_ITERATIONS=60;
|
jpayne@68
|
65
|
jpayne@68
|
66 Timer t=new Timer();
|
jpayne@68
|
67 TaxTree tree=new TaxTree(args);
|
jpayne@68
|
68 t.stop();
|
jpayne@68
|
69
|
jpayne@68
|
70 outstream.println("Retained "+tree.nodeCount+" nodes:");
|
jpayne@68
|
71
|
jpayne@68
|
72 for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){
|
jpayne@68
|
73 outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]);
|
jpayne@68
|
74 if(verbose){
|
jpayne@68
|
75 int lim=10;
|
jpayne@68
|
76 for(int j=0; j<lim && j<tree.treeLevelsExtended[i].length; j++){
|
jpayne@68
|
77 TaxNode n=tree.treeLevelsExtended[i][j];
|
jpayne@68
|
78 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]);
|
jpayne@68
|
79 }
|
jpayne@68
|
80 for(int j=tree.treeLevelsExtended[i].length-lim; j<tree.treeLevelsExtended[i].length; j++){
|
jpayne@68
|
81 if(j>=lim){
|
jpayne@68
|
82 TaxNode n=tree.treeLevelsExtended[i][j];
|
jpayne@68
|
83 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]);
|
jpayne@68
|
84 }
|
jpayne@68
|
85 }
|
jpayne@68
|
86 }
|
jpayne@68
|
87 outstream.println();
|
jpayne@68
|
88 }
|
jpayne@68
|
89
|
jpayne@68
|
90
|
jpayne@68
|
91 outstream.println();
|
jpayne@68
|
92 outstream.println("Time: \t"+t);
|
jpayne@68
|
93
|
jpayne@68
|
94 if(args.length>2){//Write a tree
|
jpayne@68
|
95 ReadWrite.write(tree, args[3], true);
|
jpayne@68
|
96 }
|
jpayne@68
|
97 }
|
jpayne@68
|
98
|
jpayne@68
|
99 /** Parse arguments from the command line */
|
jpayne@68
|
100 private Parser parse(String[] args){
|
jpayne@68
|
101
|
jpayne@68
|
102 //Create a parser object
|
jpayne@68
|
103 Parser parser=new Parser();
|
jpayne@68
|
104
|
jpayne@68
|
105 //Set any necessary Parser defaults here
|
jpayne@68
|
106 //parser.foo=bar;
|
jpayne@68
|
107
|
jpayne@68
|
108 //Parse each argument
|
jpayne@68
|
109 for(int i=0; i<args.length; i++){
|
jpayne@68
|
110 String arg=args[i];
|
jpayne@68
|
111
|
jpayne@68
|
112 //Break arguments into their constituent parts, in the form of "a=b"
|
jpayne@68
|
113 String[] split=arg.split("=");
|
jpayne@68
|
114 String a=split[0].toLowerCase();
|
jpayne@68
|
115 String b=split.length>1 ? split[1] : null;
|
jpayne@68
|
116 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
|
jpayne@68
|
117
|
jpayne@68
|
118 if(a.equals("verbose")){
|
jpayne@68
|
119 verbose=Parse.parseBoolean(b);
|
jpayne@68
|
120 }else if(a.equals("parse_flag_goes_here")){
|
jpayne@68
|
121 long fake_variable=Parse.parseKMG(b);
|
jpayne@68
|
122 //Set a variable here
|
jpayne@68
|
123 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
|
jpayne@68
|
124 //do nothing
|
jpayne@68
|
125 }else if(i>3){
|
jpayne@68
|
126 outstream.println("Unknown parameter "+args[i]);
|
jpayne@68
|
127 assert(false) : "Unknown parameter "+args[i];
|
jpayne@68
|
128 }
|
jpayne@68
|
129 }
|
jpayne@68
|
130
|
jpayne@68
|
131 return parser;
|
jpayne@68
|
132 }
|
jpayne@68
|
133
|
jpayne@68
|
134 /*--------------------------------------------------------------*/
|
jpayne@68
|
135 /*---------------- Initialization ----------------*/
|
jpayne@68
|
136 /*--------------------------------------------------------------*/
|
jpayne@68
|
137
|
jpayne@68
|
138 /*--------------------------------------------------------------*/
|
jpayne@68
|
139 /*---------------- Constructors ----------------*/
|
jpayne@68
|
140 /*--------------------------------------------------------------*/
|
jpayne@68
|
141
|
jpayne@68
|
142 /**
|
jpayne@68
|
143 * Constructor using filenames from command line arguments, in the format of:
|
jpayne@68
|
144 * {names, nodes, merged}
|
jpayne@68
|
145 * @param args Command line arguments
|
jpayne@68
|
146 */
|
jpayne@68
|
147 private TaxTree(String[] args){
|
jpayne@68
|
148 this(args[0], args[1], args[2], args);
|
jpayne@68
|
149 }
|
jpayne@68
|
150
|
jpayne@68
|
151 /**
|
jpayne@68
|
152 * @param namesFile NCBI names.txt
|
jpayne@68
|
153 * @param nodesFile NCBI nodes.txt
|
jpayne@68
|
154 * @param mergedFile NCBI merged.txt
|
jpayne@68
|
155 * @param args
|
jpayne@68
|
156 */
|
jpayne@68
|
157 private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){
|
jpayne@68
|
158
|
jpayne@68
|
159 if(args!=null) {
|
jpayne@68
|
160 Parser parser=parse(args);
|
jpayne@68
|
161 }
|
jpayne@68
|
162
|
jpayne@68
|
163 nodes=getNames(namesFile);
|
jpayne@68
|
164 getNodes(nodesFile, nodes);
|
jpayne@68
|
165
|
jpayne@68
|
166 mergedMap=getMerged(mergedFile);
|
jpayne@68
|
167
|
jpayne@68
|
168 countChildren();
|
jpayne@68
|
169 outstream.println("Counted children.");
|
jpayne@68
|
170 int rounds=percolate();
|
jpayne@68
|
171 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
|
jpayne@68
|
172
|
jpayne@68
|
173 if(assignStrains){
|
jpayne@68
|
174 assignStrains();
|
jpayne@68
|
175 rounds=percolate();
|
jpayne@68
|
176 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
|
jpayne@68
|
177 }
|
jpayne@68
|
178
|
jpayne@68
|
179 if(simplify){
|
jpayne@68
|
180 if(verbose){outstream.println("Simplifying.");}
|
jpayne@68
|
181 int removed=simplify(nodes);
|
jpayne@68
|
182 if(verbose){outstream.println("Removed "+removed+" nodes.");}
|
jpayne@68
|
183 rounds=percolate();
|
jpayne@68
|
184 outstream.println("Percolated "+rounds+" rounds to fixpoint.");
|
jpayne@68
|
185 }
|
jpayne@68
|
186 int errors=test(nodes);
|
jpayne@68
|
187 // assert(errors==0); //Not possible since the tree is wrong.
|
jpayne@68
|
188 if(errors>0) {
|
jpayne@68
|
189 System.err.println("Found "+errors+" errors in tree.");
|
jpayne@68
|
190 }
|
jpayne@68
|
191
|
jpayne@68
|
192 for(TaxNode n : nodes){
|
jpayne@68
|
193 if(n!=null){
|
jpayne@68
|
194 nodesPerLevel[n.level]++;
|
jpayne@68
|
195 nodesPerLevelExtended[n.levelExtended]++;
|
jpayne@68
|
196 }
|
jpayne@68
|
197 }
|
jpayne@68
|
198
|
jpayne@68
|
199 // for(int i=0; i<nodesPerLevel.length; i++){
|
jpayne@68
|
200 // treeLevels[i]=new TaxNode[nodesPerLevel[i]];
|
jpayne@68
|
201 // }
|
jpayne@68
|
202 for(int i=0; i<nodesPerLevelExtended.length; i++){
|
jpayne@68
|
203 treeLevelsExtended[i]=new TaxNode[nodesPerLevelExtended[i]];
|
jpayne@68
|
204 }
|
jpayne@68
|
205
|
jpayne@68
|
206 // {
|
jpayne@68
|
207 // int[] temp=new int[nodesPerLevel.length];
|
jpayne@68
|
208 // for(TaxNode n : nodes){
|
jpayne@68
|
209 // if(n!=null){
|
jpayne@68
|
210 // int level=n.level;
|
jpayne@68
|
211 // treeLevels[level][temp[level]]=n;
|
jpayne@68
|
212 // temp[level]++;
|
jpayne@68
|
213 // }
|
jpayne@68
|
214 // }
|
jpayne@68
|
215 // }
|
jpayne@68
|
216
|
jpayne@68
|
217 {
|
jpayne@68
|
218 int[] temp=new int[nodesPerLevelExtended.length];
|
jpayne@68
|
219 for(TaxNode n : nodes){
|
jpayne@68
|
220 if(n!=null){
|
jpayne@68
|
221 int level=n.levelExtended;
|
jpayne@68
|
222 treeLevelsExtended[level][temp[level]]=n;
|
jpayne@68
|
223 temp[level]++;
|
jpayne@68
|
224 }
|
jpayne@68
|
225 }
|
jpayne@68
|
226 }
|
jpayne@68
|
227 nodeCount=(int)Tools.sum(nodesPerLevelExtended);
|
jpayne@68
|
228
|
jpayne@68
|
229 }
|
jpayne@68
|
230
|
jpayne@68
|
231 /*--------------------------------------------------------------*/
|
jpayne@68
|
232 /*--------------- Loaders ----------------*/
|
jpayne@68
|
233 /*--------------------------------------------------------------*/
|
jpayne@68
|
234
|
jpayne@68
|
235
|
jpayne@68
|
236 /**
|
jpayne@68
|
237 * Load a tax tree from disk.
|
jpayne@68
|
238 * @param taxTreeFile Serialized TaxTree.
|
jpayne@68
|
239 * @param hashNames Hash nodes using names as keys
|
jpayne@68
|
240 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens
|
jpayne@68
|
241 * @return
|
jpayne@68
|
242 */
|
jpayne@68
|
243 public static final TaxTree loadTaxTree(String taxTreeFile, PrintStream outstream, boolean hashNames,
|
jpayne@68
|
244 boolean hashDotFormat){
|
jpayne@68
|
245 if(taxTreeFile==null){return null;}
|
jpayne@68
|
246 return loadTaxTree(taxTreeFile, null, null, null, outstream, hashNames, hashDotFormat);
|
jpayne@68
|
247 }
|
jpayne@68
|
248
|
jpayne@68
|
249 /**
|
jpayne@68
|
250 * Load a tax tree from disk, either from a binary tree file,
|
jpayne@68
|
251 * or from NCBI text files.
|
jpayne@68
|
252 * @param taxTreeFile Binary representation; mutually exclusive with other files.
|
jpayne@68
|
253 * @param taxNameFile NCBI names.txt
|
jpayne@68
|
254 * @param taxNodeFile NCBI nodes.txt
|
jpayne@68
|
255 * @param taxMergedFile NCBI merged.txt
|
jpayne@68
|
256 * @param hashNames Hash nodes using names as keys
|
jpayne@68
|
257 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens
|
jpayne@68
|
258 * @return The loaded tree
|
jpayne@68
|
259 */
|
jpayne@68
|
260 public static final TaxTree loadTaxTree(String taxTreeFile, String taxNameFile, String taxNodeFile,
|
jpayne@68
|
261 String taxMergedFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){
|
jpayne@68
|
262 if(taxTreeFile!=null || taxNodeFile==null){
|
jpayne@68
|
263 TaxTree tree=sharedTree(taxTreeFile, hashNames, hashDotFormat, outstream);
|
jpayne@68
|
264 if(tree!=null){return tree;}
|
jpayne@68
|
265 }
|
jpayne@68
|
266 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=defaultTreeFile();}
|
jpayne@68
|
267 assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files.";
|
jpayne@68
|
268 Timer t=new Timer();
|
jpayne@68
|
269 if(outstream!=null){outstream.print("\nLoading tax tree; ");}
|
jpayne@68
|
270 final TaxTree tree;
|
jpayne@68
|
271 if(taxTreeFile!=null){
|
jpayne@68
|
272 tree=ReadWrite.read(TaxTree.class, taxTreeFile, true);
|
jpayne@68
|
273 }else{
|
jpayne@68
|
274 tree=new TaxTree(taxNameFile, taxNodeFile, taxMergedFile, null);
|
jpayne@68
|
275 }
|
jpayne@68
|
276 t.stop();
|
jpayne@68
|
277 if(hashNames){
|
jpayne@68
|
278 if(outstream!=null){outstream.println("Hashing names.");}
|
jpayne@68
|
279 tree.hashNames(hashDotFormat);
|
jpayne@68
|
280 }
|
jpayne@68
|
281 if(outstream!=null){
|
jpayne@68
|
282 outstream.println("Time: \t"+t);
|
jpayne@68
|
283 Shared.printMemory();
|
jpayne@68
|
284 outstream.println();
|
jpayne@68
|
285 }
|
jpayne@68
|
286 if(ALLOW_SHARED_TREE){sharedTree=tree;}
|
jpayne@68
|
287 return tree;
|
jpayne@68
|
288 }
|
jpayne@68
|
289
|
jpayne@68
|
290 /*--------------------------------------------------------------*/
|
jpayne@68
|
291 /*--------------- Constructor Helpers ----------------*/
|
jpayne@68
|
292 /*--------------------------------------------------------------*/
|
jpayne@68
|
293
|
jpayne@68
|
294 /**
|
jpayne@68
|
295 * Finds unranked nodes in the archaeal and bacterial kingdoms.
|
jpayne@68
|
296 * If these are below species level, have a ranked parent,
|
jpayne@68
|
297 * and have no ranked children, they are assigned strain or substrain.
|
jpayne@68
|
298 */
|
jpayne@68
|
299 private void assignStrains(){
|
jpayne@68
|
300
|
jpayne@68
|
301 outstream.println("Assigning strains.");
|
jpayne@68
|
302 int strains=0, substrains=0;
|
jpayne@68
|
303 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed yet
|
jpayne@68
|
304 TaxNode archaea=getNode(ARCHAEA_ID);
|
jpayne@68
|
305 assert(bacteria.name.equalsIgnoreCase("Bacteria"));
|
jpayne@68
|
306 assert(archaea.name.equalsIgnoreCase("Archaea"));
|
jpayne@68
|
307
|
jpayne@68
|
308 ArrayList<TaxNode> bactList=new ArrayList<TaxNode>();
|
jpayne@68
|
309 ArrayList<TaxNode> archList=new ArrayList<TaxNode>();
|
jpayne@68
|
310 for(TaxNode tn : nodes){
|
jpayne@68
|
311 if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){
|
jpayne@68
|
312 if(descendsFrom(tn, bacteria)){
|
jpayne@68
|
313 bactList.add(tn);
|
jpayne@68
|
314 }else if(descendsFrom(tn, archaea)){
|
jpayne@68
|
315 archList.add(tn);
|
jpayne@68
|
316 }
|
jpayne@68
|
317 }
|
jpayne@68
|
318 }
|
jpayne@68
|
319
|
jpayne@68
|
320 ArrayList<TaxNode> prokList=new ArrayList<TaxNode>(bactList.size()+archList.size());
|
jpayne@68
|
321 prokList.addAll(bactList);
|
jpayne@68
|
322 prokList.addAll(archList);
|
jpayne@68
|
323
|
jpayne@68
|
324 for(TaxNode tn : prokList){
|
jpayne@68
|
325 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){
|
jpayne@68
|
326 TaxNode parent=nodes[tn.pid];
|
jpayne@68
|
327 if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){
|
jpayne@68
|
328 tn.levelExtended=STRAIN_E;
|
jpayne@68
|
329 tn.level=SUBSPECIES;
|
jpayne@68
|
330 tn.setOriginalLevel(STRAIN_E);
|
jpayne@68
|
331 strains++;
|
jpayne@68
|
332 }
|
jpayne@68
|
333 }
|
jpayne@68
|
334 }
|
jpayne@68
|
335
|
jpayne@68
|
336 // outstream.println("Assigned "+strains+" strains.");
|
jpayne@68
|
337 for(TaxNode tn : prokList){
|
jpayne@68
|
338 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){
|
jpayne@68
|
339 TaxNode parent=nodes[tn.pid];
|
jpayne@68
|
340 if(parent.levelExtended==STRAIN_E){
|
jpayne@68
|
341 tn.levelExtended=SUBSTRAIN_E;
|
jpayne@68
|
342 tn.level=SUBSPECIES;
|
jpayne@68
|
343 tn.setOriginalLevel(SUBSTRAIN_E);
|
jpayne@68
|
344 substrains++;
|
jpayne@68
|
345 }
|
jpayne@68
|
346 }
|
jpayne@68
|
347 }
|
jpayne@68
|
348 // outstream.println("Assigned "+substrains+" substrains.");
|
jpayne@68
|
349 }
|
jpayne@68
|
350
|
jpayne@68
|
351 @Deprecated
|
jpayne@68
|
352 private void assignStrainsOld(){
|
jpayne@68
|
353
|
jpayne@68
|
354 outstream.println("Assigning strains.");
|
jpayne@68
|
355 int strains=0, substrains=0;
|
jpayne@68
|
356 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed
|
jpayne@68
|
357 assert(bacteria.name.equalsIgnoreCase("Bacteria"));
|
jpayne@68
|
358 for(TaxNode tn : nodes){
|
jpayne@68
|
359 if(tn!=null && tn.originalLevel()==NO_RANK){
|
jpayne@68
|
360 TaxNode parent=nodes[tn.pid];
|
jpayne@68
|
361 if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){
|
jpayne@68
|
362 // nodesPerLevelExtended[STRAIN_E]++;
|
jpayne@68
|
363 // nodesPerLevelExtended[tn.levelExtended]--;
|
jpayne@68
|
364 tn.levelExtended=STRAIN_E;
|
jpayne@68
|
365 tn.level=SUBSPECIES;
|
jpayne@68
|
366 tn.setOriginalLevel(STRAIN_E);
|
jpayne@68
|
367 strains++;
|
jpayne@68
|
368 }
|
jpayne@68
|
369 }
|
jpayne@68
|
370 }
|
jpayne@68
|
371 // outstream.println("Assigned "+strains+" strains.");
|
jpayne@68
|
372 for(TaxNode tn : nodes){
|
jpayne@68
|
373 if(tn!=null && tn.originalLevel()==NO_RANK){
|
jpayne@68
|
374 TaxNode parent=nodes[tn.pid];
|
jpayne@68
|
375 if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){
|
jpayne@68
|
376 // nodesPerLevelExtended[SUBSTRAIN_E]++;
|
jpayne@68
|
377 // nodesPerLevelExtended[tn.levelExtended]--;
|
jpayne@68
|
378 tn.levelExtended=SUBSTRAIN_E;
|
jpayne@68
|
379 tn.level=SUBSPECIES;
|
jpayne@68
|
380 tn.setOriginalLevel(SUBSTRAIN_E);
|
jpayne@68
|
381 substrains++;
|
jpayne@68
|
382 }
|
jpayne@68
|
383 }
|
jpayne@68
|
384 }
|
jpayne@68
|
385 // outstream.println("Assigned "+substrains+" substrains.");
|
jpayne@68
|
386 }
|
jpayne@68
|
387
|
jpayne@68
|
388 /*--------------------------------------------------------------*/
|
jpayne@68
|
389 /*---------------- Construction ----------------*/
|
jpayne@68
|
390 /*--------------------------------------------------------------*/
|
jpayne@68
|
391
|
jpayne@68
|
392 /**
|
jpayne@68
|
393 * Create tax nodes using names in the designated file.
|
jpayne@68
|
394 * @param fname NCBI names.txt
|
jpayne@68
|
395 * @return Array of created nodes, where array[x] contains the node with TaxID x.
|
jpayne@68
|
396 */
|
jpayne@68
|
397 private static TaxNode[] getNames(String fname){
|
jpayne@68
|
398 ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000);
|
jpayne@68
|
399 int max=0;
|
jpayne@68
|
400
|
jpayne@68
|
401 TextFile tf=new TextFile(fname, false);
|
jpayne@68
|
402 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
|
jpayne@68
|
403 if(s.contains("scientific name")){
|
jpayne@68
|
404 String[] split=delimiter.split(s, 3);
|
jpayne@68
|
405 assert(split.length==3) : s;
|
jpayne@68
|
406 int id=Integer.parseInt(split[0]);
|
jpayne@68
|
407 String name=split[1];
|
jpayne@68
|
408 if(id==1 && name.equalsIgnoreCase("root")){name="Life";}
|
jpayne@68
|
409 max=Tools.max(max, id);
|
jpayne@68
|
410 list.add(new TaxNode(id, name));
|
jpayne@68
|
411 }
|
jpayne@68
|
412 }
|
jpayne@68
|
413
|
jpayne@68
|
414 TaxNode[] nodes=new TaxNode[max+1];
|
jpayne@68
|
415 for(TaxNode n : list){
|
jpayne@68
|
416 assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n;
|
jpayne@68
|
417 nodes[n.id]=n;
|
jpayne@68
|
418 }
|
jpayne@68
|
419
|
jpayne@68
|
420 return nodes;
|
jpayne@68
|
421 }
|
jpayne@68
|
422
|
jpayne@68
|
423 /**
|
jpayne@68
|
424 * Parses names file a second time to fill in additional information.
|
jpayne@68
|
425 * Should really be merged into getNames.
|
jpayne@68
|
426 * @TODO Merge into getNames
|
jpayne@68
|
427 */
|
jpayne@68
|
428 private static TaxNode[] getNodes(String fname, TaxNode[] nodes){
|
jpayne@68
|
429
|
jpayne@68
|
430 int max=0;
|
jpayne@68
|
431
|
jpayne@68
|
432 LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>();
|
jpayne@68
|
433
|
jpayne@68
|
434 TextFile tf=new TextFile(fname, false);
|
jpayne@68
|
435 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
|
jpayne@68
|
436 String[] split=delimiter.split(s, 4);
|
jpayne@68
|
437 assert(split.length==4) : s;
|
jpayne@68
|
438 int id=-1, pid=-1, level=-1, levelExtended=-1;
|
jpayne@68
|
439
|
jpayne@68
|
440 id=Integer.parseInt(split[0]);
|
jpayne@68
|
441 try {
|
jpayne@68
|
442 pid=Integer.parseInt(split[1]);
|
jpayne@68
|
443 } catch (NumberFormatException e) {
|
jpayne@68
|
444 // TODO Auto-generated catch block
|
jpayne@68
|
445 e.printStackTrace();
|
jpayne@68
|
446 System.err.println("Bad line: "+s+"\n"+Arrays.toString(split));
|
jpayne@68
|
447 }
|
jpayne@68
|
448 boolean alt=false;
|
jpayne@68
|
449 {
|
jpayne@68
|
450 String key=split[2];
|
jpayne@68
|
451 Integer obj0=levelMap.get(key);
|
jpayne@68
|
452 Integer obj=levelMapExtended.get(key);
|
jpayne@68
|
453 assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split);
|
jpayne@68
|
454
|
jpayne@68
|
455 if(obj0==null){
|
jpayne@68
|
456 obj0=altLevelMap.get(key);
|
jpayne@68
|
457 alt=true;
|
jpayne@68
|
458 }
|
jpayne@68
|
459 if(obj0!=null){
|
jpayne@68
|
460 level=obj0;
|
jpayne@68
|
461 levelExtended=obj;
|
jpayne@68
|
462 if(id==pid){
|
jpayne@68
|
463 level=LIFE;
|
jpayne@68
|
464 levelExtended=LIFE_E;
|
jpayne@68
|
465 alt=false;
|
jpayne@68
|
466 }
|
jpayne@68
|
467 }else{
|
jpayne@68
|
468 if(id==pid){
|
jpayne@68
|
469 level=LIFE;
|
jpayne@68
|
470 levelExtended=LIFE_E;
|
jpayne@68
|
471 alt=false;
|
jpayne@68
|
472 }else{
|
jpayne@68
|
473 int[] count=oddNames.get(key);
|
jpayne@68
|
474 if(count==null){
|
jpayne@68
|
475 count=new int[1];
|
jpayne@68
|
476 oddNames.put(key, count);
|
jpayne@68
|
477 }
|
jpayne@68
|
478 count[0]++;
|
jpayne@68
|
479 }
|
jpayne@68
|
480 }
|
jpayne@68
|
481 }
|
jpayne@68
|
482 max=Tools.max(max, id);
|
jpayne@68
|
483 TaxNode n=nodes[id];
|
jpayne@68
|
484 assert(n!=null && n.pid<0) : n+" -> "+s;
|
jpayne@68
|
485 n.pid=pid;
|
jpayne@68
|
486 n.level=level;
|
jpayne@68
|
487 n.levelExtended=levelExtended;
|
jpayne@68
|
488 n.setOriginalLevel(levelExtended);
|
jpayne@68
|
489 n.setCanonical(!alt);
|
jpayne@68
|
490 assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n";
|
jpayne@68
|
491 }
|
jpayne@68
|
492
|
jpayne@68
|
493 if(oddNames.size()>0){
|
jpayne@68
|
494 outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:");
|
jpayne@68
|
495 if(verbose){
|
jpayne@68
|
496 for(String s : oddNames.keySet()){
|
jpayne@68
|
497 outstream.println(oddNames.get(s)[0]+"\t"+s);
|
jpayne@68
|
498 }
|
jpayne@68
|
499 }
|
jpayne@68
|
500 }
|
jpayne@68
|
501
|
jpayne@68
|
502 return nodes;
|
jpayne@68
|
503 }
|
jpayne@68
|
504
|
jpayne@68
|
505 /**
|
jpayne@68
|
506 * Count child nodes of each node.
|
jpayne@68
|
507 * This can be used to size arrays or determine which nodes are leaves.
|
jpayne@68
|
508 */
|
jpayne@68
|
509 private void countChildren(){
|
jpayne@68
|
510 for(TaxNode child : nodes){
|
jpayne@68
|
511 if(child!=null && child.pid!=child.id){
|
jpayne@68
|
512 TaxNode parent=getNode(child.pid);
|
jpayne@68
|
513 if(parent!=child){parent.numChildren++;}
|
jpayne@68
|
514 }
|
jpayne@68
|
515 }
|
jpayne@68
|
516 }
|
jpayne@68
|
517
|
jpayne@68
|
518 //TODO - This could be finished in 2 passes using the childTable.
|
jpayne@68
|
519 /**
|
jpayne@68
|
520 * Fill derived fields minParentLevelExtended and maxChildLevelExtended
|
jpayne@68
|
521 * by percolating information through the tree until a fixpoint is reached.
|
jpayne@68
|
522 * @TODO This could be finished in 2 passes using the childTable.
|
jpayne@68
|
523 * @return Number of rounds required to reach fixpoint.
|
jpayne@68
|
524 */
|
jpayne@68
|
525 private int percolate(){
|
jpayne@68
|
526 boolean changed=true;
|
jpayne@68
|
527 int rounds=0;
|
jpayne@68
|
528 while(changed){
|
jpayne@68
|
529 changed=false;
|
jpayne@68
|
530 rounds++;
|
jpayne@68
|
531 for(TaxNode child : nodes){
|
jpayne@68
|
532 if(child!=null && child.pid!=child.id){
|
jpayne@68
|
533 TaxNode parent=getNode(child.pid);
|
jpayne@68
|
534 changed=(child.discussWithParent(parent) | changed);
|
jpayne@68
|
535 }
|
jpayne@68
|
536 }
|
jpayne@68
|
537
|
jpayne@68
|
538 if(!changed){break;}
|
jpayne@68
|
539 changed=false;
|
jpayne@68
|
540 rounds++;
|
jpayne@68
|
541 for(int i=nodes.length-1; i>=0; i--){
|
jpayne@68
|
542 TaxNode child=nodes[i];
|
jpayne@68
|
543 if(child!=null && child.pid!=child.id){
|
jpayne@68
|
544 TaxNode parent=getNode(child.pid);
|
jpayne@68
|
545 changed=(child.discussWithParent(parent) | changed);
|
jpayne@68
|
546 }
|
jpayne@68
|
547 }
|
jpayne@68
|
548 }
|
jpayne@68
|
549 return rounds;
|
jpayne@68
|
550 }
|
jpayne@68
|
551
|
jpayne@68
|
552 /**
|
jpayne@68
|
553 * Load nodes into the nameMap and nameMapLower, mapped to their names.
|
jpayne@68
|
554 * @param genusDotSpecies Also hash abbreviations such as E.coli.
|
jpayne@68
|
555 */
|
jpayne@68
|
556 public synchronized void hashNames(boolean genusDotSpecies){
|
jpayne@68
|
557 if(nameMap!=null){return;}
|
jpayne@68
|
558 assert(nameMap==null);
|
jpayne@68
|
559 assert(nameMapLower==null);
|
jpayne@68
|
560 final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN));
|
jpayne@68
|
561 nameMap=new HashMap<String, ArrayList<TaxNode>>(size);
|
jpayne@68
|
562 nameMapLower=new HashMap<String, ArrayList<TaxNode>>(size);
|
jpayne@68
|
563
|
jpayne@68
|
564 //Hash the names, both lowercase and uppercase
|
jpayne@68
|
565 for(TaxNode n : nodes){
|
jpayne@68
|
566 if(n!=null){
|
jpayne@68
|
567 String name=n.name;
|
jpayne@68
|
568 if(name.indexOf('_')>=0){
|
jpayne@68
|
569 name=name.replace('_', ' ').trim();
|
jpayne@68
|
570 }
|
jpayne@68
|
571 if(name!=null && !name.equals("environmental samples")){
|
jpayne@68
|
572 {
|
jpayne@68
|
573 ArrayList<TaxNode> list=nameMap.get(name);
|
jpayne@68
|
574 if(list==null){
|
jpayne@68
|
575 list=new ArrayList<TaxNode>();
|
jpayne@68
|
576 nameMap.put(name, list);
|
jpayne@68
|
577 }
|
jpayne@68
|
578 list.add(n);
|
jpayne@68
|
579 }
|
jpayne@68
|
580 {
|
jpayne@68
|
581 String lc=name.toLowerCase();
|
jpayne@68
|
582 ArrayList<TaxNode> list=nameMapLower.get(lc);
|
jpayne@68
|
583 if(list==null){
|
jpayne@68
|
584 list=new ArrayList<TaxNode>();
|
jpayne@68
|
585 nameMapLower.put(lc, list);
|
jpayne@68
|
586 }
|
jpayne@68
|
587 list.add(n);
|
jpayne@68
|
588 }
|
jpayne@68
|
589 }
|
jpayne@68
|
590 }
|
jpayne@68
|
591 }
|
jpayne@68
|
592
|
jpayne@68
|
593 //Hash G.species versions of the names, both lowercase and uppercase
|
jpayne@68
|
594 if(genusDotSpecies){
|
jpayne@68
|
595 ByteBuilder bb=new ByteBuilder(64);
|
jpayne@68
|
596 for(TaxNode n : nodes){
|
jpayne@68
|
597 if(n!=null && n.levelExtended==SPECIES_E){
|
jpayne@68
|
598 String name=n.name;
|
jpayne@68
|
599 if(name.indexOf('_')>=0){
|
jpayne@68
|
600 name=name.replace('_', ' ').trim();
|
jpayne@68
|
601 }
|
jpayne@68
|
602 if(name!=null && !name.equals("environmental samples")){
|
jpayne@68
|
603 final String dotFormat=dotFormat(name, bb);
|
jpayne@68
|
604 if(dotFormat!=null){
|
jpayne@68
|
605 {
|
jpayne@68
|
606 ArrayList<TaxNode> list=nameMap.get(dotFormat);
|
jpayne@68
|
607 if(list==null){
|
jpayne@68
|
608 list=new ArrayList<TaxNode>();
|
jpayne@68
|
609 nameMap.put(dotFormat, list);
|
jpayne@68
|
610 }
|
jpayne@68
|
611 list.add(n);
|
jpayne@68
|
612 }
|
jpayne@68
|
613 {
|
jpayne@68
|
614 String lc=dotFormat.toLowerCase();
|
jpayne@68
|
615 ArrayList<TaxNode> list=nameMapLower.get(lc);
|
jpayne@68
|
616 if(list==null){
|
jpayne@68
|
617 list=new ArrayList<TaxNode>();
|
jpayne@68
|
618 nameMapLower.put(lc, list);
|
jpayne@68
|
619 }
|
jpayne@68
|
620 list.add(n);
|
jpayne@68
|
621 }
|
jpayne@68
|
622 }
|
jpayne@68
|
623 }
|
jpayne@68
|
624 }
|
jpayne@68
|
625 }
|
jpayne@68
|
626 }
|
jpayne@68
|
627 }
|
jpayne@68
|
628
|
jpayne@68
|
629 /**
|
jpayne@68
|
630 * Generate the "dot format" name of a node.
|
jpayne@68
|
631 * For example, transform "Homo sapiens" to "H.sapiens"
|
jpayne@68
|
632 * @param name Node name
|
jpayne@68
|
633 * @param buffer A ByteBuilder that may be modified
|
jpayne@68
|
634 * @return Dot format
|
jpayne@68
|
635 */
|
jpayne@68
|
636 private static String dotFormat(String name, ByteBuilder buffer){
|
jpayne@68
|
637 if(name==null || name.indexOf('.')>=0){return null;}
|
jpayne@68
|
638 final int firstSpace=name.indexOf(' ');
|
jpayne@68
|
639 if(firstSpace<0 || firstSpace>=name.length()-1){return null;}
|
jpayne@68
|
640 final int lastSpace=name.lastIndexOf(' ');
|
jpayne@68
|
641 if(firstSpace!=lastSpace){return null;}
|
jpayne@68
|
642 final String a=name.substring(0, firstSpace);
|
jpayne@68
|
643 final String b=name.substring(lastSpace+1);
|
jpayne@68
|
644 final char ca=a.charAt(0);
|
jpayne@68
|
645 final char cb=b.charAt(0);
|
jpayne@68
|
646 if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;}
|
jpayne@68
|
647 if(buffer==null){buffer=new ByteBuilder(2+b.length());}
|
jpayne@68
|
648 else{buffer.clear();}
|
jpayne@68
|
649 buffer.append(ca).append('.').append(b);
|
jpayne@68
|
650 return buffer.toString();
|
jpayne@68
|
651 }
|
jpayne@68
|
652
|
jpayne@68
|
653 /**
|
jpayne@68
|
654 * Fill childMap, which maps nodes to their children.
|
jpayne@68
|
655 */
|
jpayne@68
|
656 public synchronized void hashChildren(){
|
jpayne@68
|
657 assert(childMap==null);
|
jpayne@68
|
658 int nodesWithChildren=0;
|
jpayne@68
|
659 for(TaxNode tn : nodes){
|
jpayne@68
|
660 if(tn!=null && tn.numChildren>0){nodesWithChildren++;}
|
jpayne@68
|
661 }
|
jpayne@68
|
662 childMap=new HashMap<TaxNode, ArrayList<TaxNode>>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN));
|
jpayne@68
|
663 for(TaxNode tn : nodes){
|
jpayne@68
|
664 if(tn!=null){
|
jpayne@68
|
665 if(tn.numChildren>0){
|
jpayne@68
|
666 childMap.put(tn, new ArrayList<TaxNode>(tn.numChildren));
|
jpayne@68
|
667 }
|
jpayne@68
|
668 }
|
jpayne@68
|
669 }
|
jpayne@68
|
670 for(TaxNode tn : nodes){
|
jpayne@68
|
671 if(tn!=null){
|
jpayne@68
|
672 if(tn.id!=tn.pid){
|
jpayne@68
|
673 ArrayList<TaxNode> list=childMap.get(getNode(tn.pid));
|
jpayne@68
|
674 if(list!=null){list.add(tn);}
|
jpayne@68
|
675 }
|
jpayne@68
|
676 }
|
jpayne@68
|
677 }
|
jpayne@68
|
678 }
|
jpayne@68
|
679
|
jpayne@68
|
680 /**
|
jpayne@68
|
681 * Fetch this node's children.
|
jpayne@68
|
682 * @param parent Node in question
|
jpayne@68
|
683 * @return List of child nodes
|
jpayne@68
|
684 */
|
jpayne@68
|
685 public ArrayList<TaxNode> getChildren(TaxNode parent){
|
jpayne@68
|
686 if(parent.numChildren<1){return null;}
|
jpayne@68
|
687 if(childMap!=null){return childMap.get(parent);}
|
jpayne@68
|
688 ArrayList<TaxNode> list=new ArrayList<TaxNode>(parent.numChildren);
|
jpayne@68
|
689 for(TaxNode tn : nodes){
|
jpayne@68
|
690 if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){
|
jpayne@68
|
691 list.add(tn);
|
jpayne@68
|
692 }
|
jpayne@68
|
693 }
|
jpayne@68
|
694 return list;
|
jpayne@68
|
695 }
|
jpayne@68
|
696
|
jpayne@68
|
697 /**
|
jpayne@68
|
698 * Load a map of old to new TaxIDs.
|
jpayne@68
|
699 * @param mergedFile NCBI merged.txt.
|
jpayne@68
|
700 * @return Map of old to new TaxIDs
|
jpayne@68
|
701 */
|
jpayne@68
|
702 private static IntHashMap getMerged(String mergedFile) {
|
jpayne@68
|
703 if(mergedFile==null){return null;}
|
jpayne@68
|
704 String[] lines=TextFile.toStringLines(mergedFile);
|
jpayne@68
|
705 if(lines.length<1){return null;}
|
jpayne@68
|
706 IntHashMap map=new IntHashMap((int)(lines.length*1.3));
|
jpayne@68
|
707 for(String line : lines){
|
jpayne@68
|
708 String[] split=delimiterTab.split(line);
|
jpayne@68
|
709 int a=Integer.parseInt(split[0]);
|
jpayne@68
|
710 int b=Integer.parseInt(split[2]);
|
jpayne@68
|
711 map.put(a, b);
|
jpayne@68
|
712 }
|
jpayne@68
|
713 return map;
|
jpayne@68
|
714 }
|
jpayne@68
|
715
|
jpayne@68
|
716 /**
|
jpayne@68
|
717 * Simplify the tree by assigning ranks to unranked nodes,
|
jpayne@68
|
718 * where possible, through inference.
|
jpayne@68
|
719 * Optionally removes unranked nodes based on the skipNorank field.
|
jpayne@68
|
720 * @param nodes Array of all TaxNodes.
|
jpayne@68
|
721 * @return Number of nodes removed.
|
jpayne@68
|
722 */
|
jpayne@68
|
723 private int simplify(TaxNode nodes[]){
|
jpayne@68
|
724
|
jpayne@68
|
725 int failed=test(nodes);
|
jpayne@68
|
726
|
jpayne@68
|
727 int removed=0;
|
jpayne@68
|
728 int reassigned=0;
|
jpayne@68
|
729
|
jpayne@68
|
730 if(reassign){
|
jpayne@68
|
731 boolean changed=true;
|
jpayne@68
|
732 int changedCount=0;
|
jpayne@68
|
733 while(changed){
|
jpayne@68
|
734 changed=false;
|
jpayne@68
|
735 for(int i=0; i<nodes.length; i++){
|
jpayne@68
|
736 TaxNode n=nodes[i];
|
jpayne@68
|
737 if(n!=null && n.levelExtended<1){
|
jpayne@68
|
738 int pid=n.pid;
|
jpayne@68
|
739 TaxNode parent=nodes[pid];
|
jpayne@68
|
740 assert(parent!=null) : n;
|
jpayne@68
|
741 if(n.maxDescendantLevelIncludingSelf()<SUBSPECIES_E &&
|
jpayne@68
|
742 parent.minAncestorLevelIncludingSelf()<=SPECIES_E && parent.minAncestorLevelIncludingSelf()>=SUBSPECIES_E){
|
jpayne@68
|
743 // if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){
|
jpayne@68
|
744 changed=true;
|
jpayne@68
|
745 n.levelExtended=SUBSPECIES_E;
|
jpayne@68
|
746 n.level=SUBSPECIES;
|
jpayne@68
|
747 changedCount++;
|
jpayne@68
|
748 }
|
jpayne@68
|
749 }
|
jpayne@68
|
750 }
|
jpayne@68
|
751 }
|
jpayne@68
|
752 System.err.println("Assigned levels to "+changedCount+" unranked nodes.");
|
jpayne@68
|
753 }
|
jpayne@68
|
754
|
jpayne@68
|
755
|
jpayne@68
|
756 if(skipNorank){//Skip nodes with unknown taxa
|
jpayne@68
|
757 if(verbose){outstream.println("A0");}
|
jpayne@68
|
758
|
jpayne@68
|
759 for(int i=0; i<nodes.length; i++){
|
jpayne@68
|
760 TaxNode n=nodes[i];
|
jpayne@68
|
761 if(n!=null){
|
jpayne@68
|
762 int pid=n.pid;
|
jpayne@68
|
763 TaxNode parent=nodes[pid];
|
jpayne@68
|
764 assert(parent!=null) : n;
|
jpayne@68
|
765 assert(parent!=n || pid==1) : n+", "+pid;
|
jpayne@68
|
766 while(parent.levelExtended<1 && n.levelExtended>parent.levelExtended){
|
jpayne@68
|
767 //System.err.println("Reassigned from "+parent);
|
jpayne@68
|
768 assert(parent.id!=parent.pid);
|
jpayne@68
|
769 parent=nodes[parent.pid];
|
jpayne@68
|
770 n.pid=parent.id;
|
jpayne@68
|
771 reassigned++;
|
jpayne@68
|
772 }
|
jpayne@68
|
773 }
|
jpayne@68
|
774 }
|
jpayne@68
|
775
|
jpayne@68
|
776 for(int i=0; i<nodes.length; i++){
|
jpayne@68
|
777 if(nodes[i]!=null && nodes[i].levelExtended<0){
|
jpayne@68
|
778 System.err.println("Removed "+nodes[i]);
|
jpayne@68
|
779 nodes[i]=null;
|
jpayne@68
|
780 removed++;
|
jpayne@68
|
781 }
|
jpayne@68
|
782 }
|
jpayne@68
|
783 if(verbose){outstream.println("Skipped "+reassigned+" unranked parents, removed "+removed+" invalid nodes.");}
|
jpayne@68
|
784 }
|
jpayne@68
|
785
|
jpayne@68
|
786 if(inferRankLimit>0){//Infer level for unset nodes (from "no rank")
|
jpayne@68
|
787 if(verbose){outstream.println("A");}
|
jpayne@68
|
788 int changed=1;
|
jpayne@68
|
789 while(changed>0){
|
jpayne@68
|
790 changed=0;
|
jpayne@68
|
791 for(final TaxNode n : nodes){
|
jpayne@68
|
792 if(n!=null){
|
jpayne@68
|
793 if(n.levelExtended==0){
|
jpayne@68
|
794 TaxNode parent=nodes[n.pid];
|
jpayne@68
|
795 if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){
|
jpayne@68
|
796 n.levelExtended=Tools.max(1, parent.levelExtended-1);
|
jpayne@68
|
797 assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit);
|
jpayne@68
|
798 changed++;
|
jpayne@68
|
799 }
|
jpayne@68
|
800 }
|
jpayne@68
|
801 }
|
jpayne@68
|
802 }
|
jpayne@68
|
803 if(verbose){outstream.println("changed: "+changed);}
|
jpayne@68
|
804 }
|
jpayne@68
|
805
|
jpayne@68
|
806 // outstream.println("B");
|
jpayne@68
|
807 // for(TaxNode n : nodes){
|
jpayne@68
|
808 // if(n!=null && n.level==0){
|
jpayne@68
|
809 // n.level=-1;
|
jpayne@68
|
810 // }
|
jpayne@68
|
811 // }
|
jpayne@68
|
812 }
|
jpayne@68
|
813
|
jpayne@68
|
814 failed=test(nodes);
|
jpayne@68
|
815
|
jpayne@68
|
816 // if(reassign){//Skip nodes with duplicate taxa
|
jpayne@68
|
817 // if(verbose){outstream.println("D");}
|
jpayne@68
|
818 // int changed=1;
|
jpayne@68
|
819 // while(changed>0){
|
jpayne@68
|
820 // changed=0;
|
jpayne@68
|
821 // for(final TaxNode n : nodes){
|
jpayne@68
|
822 // if(n!=null){
|
jpayne@68
|
823 // TaxNode parent=nodes[n.pid];
|
jpayne@68
|
824 // TaxNode grandparent=nodes[parent.pid];
|
jpayne@68
|
825 // assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent;
|
jpayne@68
|
826 // assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent;
|
jpayne@68
|
827 //
|
jpayne@68
|
828 // while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) ||
|
jpayne@68
|
829 // n.level>parent.level || (n.level==parent.level))){
|
jpayne@68
|
830 // parent=grandparent;
|
jpayne@68
|
831 // grandparent=nodes[parent.pid];
|
jpayne@68
|
832 // n.pid=parent.id;
|
jpayne@68
|
833 // reassigned++;
|
jpayne@68
|
834 // changed++;
|
jpayne@68
|
835 // }
|
jpayne@68
|
836 // }
|
jpayne@68
|
837 // }
|
jpayne@68
|
838 // if(verbose){outstream.println("changed: "+changed);}
|
jpayne@68
|
839 // }
|
jpayne@68
|
840 // if(verbose){outstream.println("E");}
|
jpayne@68
|
841 // for(int i=0; i<nodes.length; i++){
|
jpayne@68
|
842 // if(nodes[i]!=null && nodes[i].level<0){
|
jpayne@68
|
843 // nodes[i]=null;
|
jpayne@68
|
844 // removed++;
|
jpayne@68
|
845 // }
|
jpayne@68
|
846 // }
|
jpayne@68
|
847 // }
|
jpayne@68
|
848
|
jpayne@68
|
849 failed=test(nodes);
|
jpayne@68
|
850
|
jpayne@68
|
851 if(verbose){outstream.println("F");}
|
jpayne@68
|
852 {//Ensure the tree is now clean
|
jpayne@68
|
853 for(int i=0; i<nodes.length; i++){
|
jpayne@68
|
854 TaxNode n=nodes[i];
|
jpayne@68
|
855 if(n!=null){
|
jpayne@68
|
856 TaxNode parent=nodes[n.pid];
|
jpayne@68
|
857 TaxNode grandparent=nodes[parent.pid];
|
jpayne@68
|
858 assert(n==parent || n.levelExtended<=parent.levelExtended || !n.canonical() || n.levelExtended<1 || parent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent;
|
jpayne@68
|
859 assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent;
|
jpayne@68
|
860 }
|
jpayne@68
|
861 }
|
jpayne@68
|
862 }
|
jpayne@68
|
863
|
jpayne@68
|
864 // if(verbose){System.err.println("Reassignments: "+reassigned);}
|
jpayne@68
|
865
|
jpayne@68
|
866 return removed;
|
jpayne@68
|
867 }
|
jpayne@68
|
868
|
jpayne@68
|
869 /*--------------------------------------------------------------*/
|
jpayne@68
|
870 /*---------------- Validation ----------------*/
|
jpayne@68
|
871 /*--------------------------------------------------------------*/
|
jpayne@68
|
872
|
jpayne@68
|
873 /**
|
jpayne@68
|
874 * Ensure tree has monotonically increasing (or nondescending) ranks.
|
jpayne@68
|
875 * @param nodes All TaxNodes.
|
jpayne@68
|
876 * @return Number of violations.
|
jpayne@68
|
877 */
|
jpayne@68
|
878 private static int test(TaxNode[] nodes){
|
jpayne@68
|
879 int failed=0;
|
jpayne@68
|
880 for(final TaxNode n : nodes){
|
jpayne@68
|
881 if(n!=null){
|
jpayne@68
|
882 TaxNode parent=nodes[n.pid];
|
jpayne@68
|
883 try {
|
jpayne@68
|
884 assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) :
|
jpayne@68
|
885 "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n"
|
jpayne@68
|
886 + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended;
|
jpayne@68
|
887 assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent;
|
jpayne@68
|
888 // assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical() || !parent.canonical()) : n+" -> "+parent;
|
jpayne@68
|
889 if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){
|
jpayne@68
|
890 if(verbose){outstream.println("Error: "+n+" -> "+parent);}
|
jpayne@68
|
891 failed++;
|
jpayne@68
|
892 }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){
|
jpayne@68
|
893 // if(verbose){outstream.println("Error: "+n+" -> "+parent);}
|
jpayne@68
|
894 // failed++;
|
jpayne@68
|
895 }
|
jpayne@68
|
896 assert(n!=parent || n.id<=1) : n;
|
jpayne@68
|
897 } catch (Throwable e) {
|
jpayne@68
|
898 // TODO Auto-generated catch block
|
jpayne@68
|
899 e.printStackTrace();
|
jpayne@68
|
900 failed++;
|
jpayne@68
|
901 }
|
jpayne@68
|
902 }
|
jpayne@68
|
903 }
|
jpayne@68
|
904 if(verbose || failed>0){outstream.println(failed+" nodes failed.");}
|
jpayne@68
|
905 return failed;
|
jpayne@68
|
906 }
|
jpayne@68
|
907
|
jpayne@68
|
908 /*--------------------------------------------------------------*/
|
jpayne@68
|
909 /*---------------- Print Methods ----------------*/
|
jpayne@68
|
910 /*--------------------------------------------------------------*/
|
jpayne@68
|
911
|
jpayne@68
|
912 /**
|
jpayne@68
|
913 * Format full name in semicolon format, e.g.
|
jpayne@68
|
914 * "SK:Bacteria;P:Protobacteria;..."
|
jpayne@68
|
915 * @param tn0 Base node
|
jpayne@68
|
916 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
|
jpayne@68
|
917 * @return Resultant String
|
jpayne@68
|
918 */
|
jpayne@68
|
919 public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){
|
jpayne@68
|
920 StringBuilder sb=new StringBuilder();
|
jpayne@68
|
921 if(tn0==null){return "Not found";}
|
jpayne@68
|
922 String semi="";
|
jpayne@68
|
923 ArrayList<TaxNode> list=toAncestors(tn0, skipNonCanonical);
|
jpayne@68
|
924 boolean addTaxLevel=true;
|
jpayne@68
|
925 for(int i=list.size()-1; i>=0; i--){
|
jpayne@68
|
926 sb.append(semi);
|
jpayne@68
|
927 TaxNode tn=list.get(i);
|
jpayne@68
|
928 if(tn.id!=LIFE_ID || list.size()==1){
|
jpayne@68
|
929 if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){
|
jpayne@68
|
930 sb.append(tn.levelToStringShort()).append(':');
|
jpayne@68
|
931 }
|
jpayne@68
|
932 sb.append(mononomial ? mononomial(tn) : tn.name);
|
jpayne@68
|
933 semi=";";
|
jpayne@68
|
934 }
|
jpayne@68
|
935 }
|
jpayne@68
|
936 return sb.toString();
|
jpayne@68
|
937 }
|
jpayne@68
|
938
|
jpayne@68
|
939 /**
|
jpayne@68
|
940 * Return a list of TaxIDs of all ancestors.
|
jpayne@68
|
941 * @param tn0 Base node
|
jpayne@68
|
942 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
|
jpayne@68
|
943 * @return List of TaxIDs.
|
jpayne@68
|
944 */
|
jpayne@68
|
945 public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){
|
jpayne@68
|
946 if(tn0==null){return null;}
|
jpayne@68
|
947 IntList list=new IntList(8);
|
jpayne@68
|
948
|
jpayne@68
|
949 TaxNode tn=tn0;
|
jpayne@68
|
950 while(tn!=null){
|
jpayne@68
|
951 if(!skipNonCanonical || tn.isSimple()){
|
jpayne@68
|
952 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);}
|
jpayne@68
|
953 }
|
jpayne@68
|
954 if(tn.pid==tn.id){break;}
|
jpayne@68
|
955 tn=getNode(tn.pid);
|
jpayne@68
|
956 }
|
jpayne@68
|
957 if(list.isEmpty()){list.add(tn0.id);}
|
jpayne@68
|
958 return list;
|
jpayne@68
|
959 }
|
jpayne@68
|
960
|
jpayne@68
|
961 /**
|
jpayne@68
|
962 * Return a list of all ancestors.
|
jpayne@68
|
963 * @param tn0 Base node
|
jpayne@68
|
964 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe.
|
jpayne@68
|
965 * @return List of ancestor nodes.
|
jpayne@68
|
966 */
|
jpayne@68
|
967 public ArrayList<TaxNode> toAncestors(final TaxNode tn0, boolean skipNonCanonical){
|
jpayne@68
|
968 if(tn0==null){return null;}
|
jpayne@68
|
969 ArrayList<TaxNode> list=new ArrayList<TaxNode>(8);
|
jpayne@68
|
970
|
jpayne@68
|
971 TaxNode tn=tn0;
|
jpayne@68
|
972 while(tn!=null){
|
jpayne@68
|
973 if(!skipNonCanonical || tn.isSimple()){
|
jpayne@68
|
974 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);}
|
jpayne@68
|
975 }
|
jpayne@68
|
976 if(tn.pid==tn.id){break;}
|
jpayne@68
|
977 tn=getNode(tn.pid);
|
jpayne@68
|
978 }
|
jpayne@68
|
979 if(list.isEmpty()){list.add(tn0);}
|
jpayne@68
|
980 return list;
|
jpayne@68
|
981 }
|
jpayne@68
|
982
|
jpayne@68
|
983 /**
|
jpayne@68
|
984 * Generate a path to the genome of an organism on the filesystem;
|
jpayne@68
|
985 * used by ExplodeTree. Intended for internal JGI use.
|
jpayne@68
|
986 * @param root Location of the exploded tree.
|
jpayne@68
|
987 * @return Path to a genome.
|
jpayne@68
|
988 */
|
jpayne@68
|
989 public String toDir(TaxNode node, String root){
|
jpayne@68
|
990 StringBuilder sb=new StringBuilder();
|
jpayne@68
|
991 if(root==null){root="";}
|
jpayne@68
|
992 sb.append(root);
|
jpayne@68
|
993 if(root.length()>0 && !root.endsWith("/")){sb.append('/');}
|
jpayne@68
|
994 IntList list=toAncestorIds(node, false);
|
jpayne@68
|
995 list.reverse();
|
jpayne@68
|
996 assert(list.get(0)==1) : list + "," +getNode(list.get(0));
|
jpayne@68
|
997 for(int i=0; i<list.size(); i++){
|
jpayne@68
|
998 sb.append(list.get(i));
|
jpayne@68
|
999 sb.append('/');
|
jpayne@68
|
1000 }
|
jpayne@68
|
1001 return sb.toString();
|
jpayne@68
|
1002 }
|
jpayne@68
|
1003
|
jpayne@68
|
1004 /*--------------------------------------------------------------*/
|
jpayne@68
|
1005 /*---------------- Outer Methods ----------------*/
|
jpayne@68
|
1006 /*--------------------------------------------------------------*/
|
jpayne@68
|
1007
|
jpayne@68
|
1008 /**
|
jpayne@68
|
1009 * Use various techniques to get a TaxID from an unknown String, such as parsing,
|
jpayne@68
|
1010 * name lookups, and accession translation.
|
jpayne@68
|
1011 * @param s String to process.
|
jpayne@68
|
1012 * @return Decoded TaxID.
|
jpayne@68
|
1013 */
|
jpayne@68
|
1014 public static int getID(String s){return GiToTaxid.getID(s);}
|
jpayne@68
|
1015
|
jpayne@68
|
1016 /**
|
jpayne@68
|
1017 * Use various techniques to get a TaxID from an unknown byte[], such as parsing,
|
jpayne@68
|
1018 * name lookups, and accession translation.
|
jpayne@68
|
1019 * @param s String to process.
|
jpayne@68
|
1020 * @return Decoded TaxID.
|
jpayne@68
|
1021 */
|
jpayne@68
|
1022 public static int getID(byte[] s){return GiToTaxid.getID(s);}
|
jpayne@68
|
1023
|
jpayne@68
|
1024 /** Return the lowest ancestor of the named node with taxonomic level at least minLevel */
|
jpayne@68
|
1025 public TaxNode getNode(String s, int minLevelExtended){
|
jpayne@68
|
1026 TaxNode tn=parseNodeFromHeader(s, true);
|
jpayne@68
|
1027 while(tn!=null && tn.levelExtended<minLevelExtended && tn.pid!=tn.id){
|
jpayne@68
|
1028 tn=getNode(tn.pid);
|
jpayne@68
|
1029 }
|
jpayne@68
|
1030 return tn;
|
jpayne@68
|
1031 }
|
jpayne@68
|
1032
|
jpayne@68
|
1033 /**
|
jpayne@68
|
1034 * Determine whether a node is a descendant of another.
|
jpayne@68
|
1035 * @param child Possible child TaxID.
|
jpayne@68
|
1036 * @param parent Possible parent TaxID.
|
jpayne@68
|
1037 * @return true iff child descends from parent.
|
jpayne@68
|
1038 */
|
jpayne@68
|
1039 public boolean descendsFrom(final int child, final int parent){
|
jpayne@68
|
1040 TaxNode cn=getNode(child), pn=getNode(parent);
|
jpayne@68
|
1041 assert(cn!=null) : "Invalid taxID: "+child;
|
jpayne@68
|
1042 assert(pn!=null) : "Invalid taxID: "+parent;
|
jpayne@68
|
1043 return descendsFrom(cn, pn);
|
jpayne@68
|
1044 }
|
jpayne@68
|
1045
|
jpayne@68
|
1046 /**
|
jpayne@68
|
1047 * Determine whether a node is a descendant of another.
|
jpayne@68
|
1048 * @param child Possible child node.
|
jpayne@68
|
1049 * @param parent Possible parent node.
|
jpayne@68
|
1050 * @return true iff child descends from parent.
|
jpayne@68
|
1051 */
|
jpayne@68
|
1052 public boolean descendsFrom(TaxNode child, TaxNode parent){
|
jpayne@68
|
1053 assert(child!=null && parent!=null) : "Null parameters.";
|
jpayne@68
|
1054 if(child==null || parent==null){return false;}
|
jpayne@68
|
1055
|
jpayne@68
|
1056 while(child!=parent && child.levelExtended<=parent.levelExtended && child.id!=child.pid){
|
jpayne@68
|
1057 child=getNode(child.pid);
|
jpayne@68
|
1058 }
|
jpayne@68
|
1059 return child==parent;
|
jpayne@68
|
1060 }
|
jpayne@68
|
1061
|
jpayne@68
|
1062 /**
|
jpayne@68
|
1063 * Determine whether an organism is classified as X.
|
jpayne@68
|
1064 * @param taxID taxID of organism.
|
jpayne@68
|
1065 * @param ancestorID taxID of possible ancestor.
|
jpayne@68
|
1066 * @return true if the organism is an X.
|
jpayne@68
|
1067 */
|
jpayne@68
|
1068 public boolean descendsFrom2(int taxID, final int ancestorID) {
|
jpayne@68
|
1069 TaxNode tn=getNode(taxID);
|
jpayne@68
|
1070 while(tn.id!=tn.pid){
|
jpayne@68
|
1071 if(tn.id==ancestorID){return true;}
|
jpayne@68
|
1072 tn=getNode(tn.pid);
|
jpayne@68
|
1073 }
|
jpayne@68
|
1074 return false;
|
jpayne@68
|
1075 }
|
jpayne@68
|
1076
|
jpayne@68
|
1077 /** Determine whether an organism is classified as a plant. */
|
jpayne@68
|
1078 public boolean isPlant(int taxID) {return descendsFrom2(taxID, VIRIDIPLANTAE_ID);}
|
jpayne@68
|
1079
|
jpayne@68
|
1080 /** Determine whether an organism is classified as an animal. */
|
jpayne@68
|
1081 public boolean isAnimal(int taxID) {return descendsFrom2(taxID, METAZOA_ID);}
|
jpayne@68
|
1082
|
jpayne@68
|
1083 /** Determine whether an organism is classified as a fungus. */
|
jpayne@68
|
1084 public boolean isFungus(int taxID) {return descendsFrom2(taxID, FUNGI_ID);}
|
jpayne@68
|
1085
|
jpayne@68
|
1086 /** Determine whether an organism is classified as a eukaryote. */
|
jpayne@68
|
1087 public boolean isEukaryote(int taxID) {return descendsFrom2(taxID, EUKARYOTA_ID);}
|
jpayne@68
|
1088
|
jpayne@68
|
1089 /** Determine whether an organism is classified as a prokaryote. */
|
jpayne@68
|
1090 public boolean isProkaryote(int taxID) {
|
jpayne@68
|
1091 TaxNode tn=getNode(taxID);
|
jpayne@68
|
1092 if(tn==null){
|
jpayne@68
|
1093 System.err.println("*** Warning: Can't find node "+taxID+" ***");
|
jpayne@68
|
1094 return false;
|
jpayne@68
|
1095 }
|
jpayne@68
|
1096 while(tn.id!=tn.pid){
|
jpayne@68
|
1097 if(tn.id==BACTERIA_ID || tn.id==ARCHAEA_ID){return true;}
|
jpayne@68
|
1098 tn=getNode(tn.pid);
|
jpayne@68
|
1099 }
|
jpayne@68
|
1100 return false;
|
jpayne@68
|
1101 }
|
jpayne@68
|
1102
|
jpayne@68
|
1103 /**
|
jpayne@68
|
1104 * Calculate the common ancestor of two nodes.
|
jpayne@68
|
1105 * @param a TaxID of a node.
|
jpayne@68
|
1106 * @param b TaxID of a node.
|
jpayne@68
|
1107 * @return Common ancestor ID of a and b.
|
jpayne@68
|
1108 */
|
jpayne@68
|
1109 public int commonAncestor(final int a, final int b){
|
jpayne@68
|
1110 TaxNode an=getNode(a), bn=getNode(b);
|
jpayne@68
|
1111 assert(an!=null) : "Invalid taxID: "+a;
|
jpayne@68
|
1112 assert(bn!=null) : "Invalid taxID: "+b;
|
jpayne@68
|
1113 TaxNode cn=commonAncestor(an, bn);
|
jpayne@68
|
1114 assert(cn!=null) : "No common ancestor: "+an+", "+bn;
|
jpayne@68
|
1115 if(cn==null){return -1;}
|
jpayne@68
|
1116 return cn.id;
|
jpayne@68
|
1117 }
|
jpayne@68
|
1118
|
jpayne@68
|
1119 /**
|
jpayne@68
|
1120 * Calculate the common ancestor of two nodes.
|
jpayne@68
|
1121 * @param a A node.
|
jpayne@68
|
1122 * @param b A node.
|
jpayne@68
|
1123 * @return Common ancestor of a and b.
|
jpayne@68
|
1124 */
|
jpayne@68
|
1125 public TaxNode commonAncestor(TaxNode a, TaxNode b){
|
jpayne@68
|
1126 assert(a!=null && b!=null) : "Null parameters.";
|
jpayne@68
|
1127 if(a==null){return b;}
|
jpayne@68
|
1128 if(b==null){return a;}
|
jpayne@68
|
1129
|
jpayne@68
|
1130 while(a!=b){
|
jpayne@68
|
1131 if(a.levelExtended<b.levelExtended){
|
jpayne@68
|
1132 a=getNode(a.pid);
|
jpayne@68
|
1133 }else{
|
jpayne@68
|
1134 b=getNode(b.pid);
|
jpayne@68
|
1135 }
|
jpayne@68
|
1136 }
|
jpayne@68
|
1137 return a;
|
jpayne@68
|
1138 }
|
jpayne@68
|
1139
|
jpayne@68
|
1140 /**
|
jpayne@68
|
1141 * Identify the highest ancestor of a node;
|
jpayne@68
|
1142 * this will presumably be "Life".
|
jpayne@68
|
1143 * @param a Node
|
jpayne@68
|
1144 * @return Highest ancestor
|
jpayne@68
|
1145 */
|
jpayne@68
|
1146 public TaxNode highestAncestor(TaxNode a){
|
jpayne@68
|
1147 assert(a!=null);
|
jpayne@68
|
1148 while(a.id!=a.pid){a=getNode(a.pid);}
|
jpayne@68
|
1149 return a;
|
jpayne@68
|
1150 }
|
jpayne@68
|
1151
|
jpayne@68
|
1152 /*--------------------------------------------------------------*/
|
jpayne@68
|
1153 /*---------------- Header Parsing ----------------*/
|
jpayne@68
|
1154 /*--------------------------------------------------------------*/
|
jpayne@68
|
1155
|
jpayne@68
|
1156 /**
|
jpayne@68
|
1157 * Determine the TaxID of a String,
|
jpayne@68
|
1158 * without a loaded TaxTree.
|
jpayne@68
|
1159 * This only works if the literal TaxID is embedded in the String.
|
jpayne@68
|
1160 * @param header Typically a sequence header
|
jpayne@68
|
1161 * @return Decoded TaxID, or -1 if unsuccessful
|
jpayne@68
|
1162 */
|
jpayne@68
|
1163 public static int parseHeaderStatic(String header){
|
jpayne@68
|
1164 if(header.length()<3){return -1;}
|
jpayne@68
|
1165 if(header.charAt(0)=='>'){header=header.substring(1);}
|
jpayne@68
|
1166 if(!header.startsWith("tid|")){return -1;}
|
jpayne@68
|
1167 int idx=3;
|
jpayne@68
|
1168 int idx2=header.indexOf('|', 4);
|
jpayne@68
|
1169 if(idx2<5){return -1;}
|
jpayne@68
|
1170 int id=-1;
|
jpayne@68
|
1171 try {
|
jpayne@68
|
1172 id=Parse.parseInt(header, idx+1, idx2);
|
jpayne@68
|
1173 // System.err.println("d"+", "+header.substring(idx+1, idx2));
|
jpayne@68
|
1174 } catch (Throwable e) {
|
jpayne@68
|
1175 // System.err.println("e"+", "+header.substring(idx+1, idx2));
|
jpayne@68
|
1176 //ignore
|
jpayne@68
|
1177 }
|
jpayne@68
|
1178 return id;
|
jpayne@68
|
1179 }
|
jpayne@68
|
1180
|
jpayne@68
|
1181 /**
|
jpayne@68
|
1182 * Determine the TaxID of a String.
|
jpayne@68
|
1183 * @param header Typically a sequence header
|
jpayne@68
|
1184 * @param bestEffort In some cases, try certain substrings if the name is not found.
|
jpayne@68
|
1185 * @return
|
jpayne@68
|
1186 */
|
jpayne@68
|
1187 public TaxNode parseNodeFromHeader(String header, boolean bestEffort){
|
jpayne@68
|
1188 if(header==null || header.length()<2){return null;}
|
jpayne@68
|
1189 if(header.charAt(0)=='>'){header=header.substring(1);}
|
jpayne@68
|
1190 TaxNode tn;
|
jpayne@68
|
1191 if(SILVA_MODE){
|
jpayne@68
|
1192 tn=getNodeSilva(header, bestEffort);
|
jpayne@68
|
1193 }else if(UNITE_MODE){
|
jpayne@68
|
1194 tn=getNodeUnite(header, bestEffort);
|
jpayne@68
|
1195 }else{
|
jpayne@68
|
1196 final char delimiter=ncbiHeaderDelimiter(header);
|
jpayne@68
|
1197 if(delimiter==' '){
|
jpayne@68
|
1198 tn=getNodeNewStyle(header);
|
jpayne@68
|
1199 }else{
|
jpayne@68
|
1200 tn=getNodeOldStyle(header, delimiter);
|
jpayne@68
|
1201 if(tn==null && delimiter=='|'){
|
jpayne@68
|
1202 // System.err.println("A: "+header);
|
jpayne@68
|
1203 int id=-1;
|
jpayne@68
|
1204 String[] split=header.split("\\|");
|
jpayne@68
|
1205 if(AccessionToTaxid.LOADED()){
|
jpayne@68
|
1206 for(int i=0; i<split.length && id<0; i++){//Try accessions first
|
jpayne@68
|
1207 if(AccessionToTaxid.isValidAccession(split[i])){
|
jpayne@68
|
1208 id=AccessionToTaxid.get(split[i]);
|
jpayne@68
|
1209 }
|
jpayne@68
|
1210 }
|
jpayne@68
|
1211 }
|
jpayne@68
|
1212 for(int i=0; i<split.length && id<0; i++){//Then names
|
jpayne@68
|
1213 id=parseNameToTaxid(split[i]);
|
jpayne@68
|
1214 }
|
jpayne@68
|
1215 // System.err.println("E: "+id);
|
jpayne@68
|
1216 if(id>=0){tn=getNode(id);}
|
jpayne@68
|
1217 // System.err.println("F: "+tn);
|
jpayne@68
|
1218 }
|
jpayne@68
|
1219 }
|
jpayne@68
|
1220 }
|
jpayne@68
|
1221 return tn;
|
jpayne@68
|
1222 }
|
jpayne@68
|
1223
|
jpayne@68
|
1224 /**
|
jpayne@68
|
1225 * Guess the delimiter character in a String;
|
jpayne@68
|
1226 * typically assumed to be '|', '~', or ' '.
|
jpayne@68
|
1227 */
|
jpayne@68
|
1228 public static char ncbiHeaderDelimiter(String header){
|
jpayne@68
|
1229 for(int i=0; i<header.length(); i++){
|
jpayne@68
|
1230 final char c=header.charAt(i);
|
jpayne@68
|
1231 if(c=='|' || c=='~'){
|
jpayne@68
|
1232 assert(i>0) : "i="+i+"; malformatted header '"+header+"'";
|
jpayne@68
|
1233 return c;
|
jpayne@68
|
1234 }else if(Character.isWhitespace(c)){
|
jpayne@68
|
1235 return ' ';
|
jpayne@68
|
1236 }
|
jpayne@68
|
1237 }
|
jpayne@68
|
1238 return ' ';
|
jpayne@68
|
1239 }
|
jpayne@68
|
1240
|
jpayne@68
|
1241 /**
|
jpayne@68
|
1242 * Parse a Silva header to a Node.
|
jpayne@68
|
1243 * @param s Silva header.
|
jpayne@68
|
1244 * @param bestEffort Try certain substrings if the name is not found.
|
jpayne@68
|
1245 * @return Node
|
jpayne@68
|
1246 */
|
jpayne@68
|
1247 TaxNode getNodeSilva(String s, boolean bestEffort){
|
jpayne@68
|
1248 if(s==null){return null;}
|
jpayne@68
|
1249 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){
|
jpayne@68
|
1250 return getNodeOldStyle(s, s.charAt(3));
|
jpayne@68
|
1251 }
|
jpayne@68
|
1252 String[] split=Tools.semiPattern.split(s);
|
jpayne@68
|
1253
|
jpayne@68
|
1254 int number=-1;
|
jpayne@68
|
1255 // final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast"));
|
jpayne@68
|
1256 // if(chloroplast){return null;}
|
jpayne@68
|
1257 for(int i=split.length-1; number<0 && i>=0; i--){
|
jpayne@68
|
1258 String last=split[i];
|
jpayne@68
|
1259 int paren=last.indexOf('(');
|
jpayne@68
|
1260 if(paren>=0){last=last.substring(0, paren);}
|
jpayne@68
|
1261 last=last.trim();
|
jpayne@68
|
1262
|
jpayne@68
|
1263 if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){
|
jpayne@68
|
1264 number=parseNameToTaxid(last);
|
jpayne@68
|
1265 }
|
jpayne@68
|
1266
|
jpayne@68
|
1267 if(number>=0){return getNode(number);}
|
jpayne@68
|
1268 else if(!bestEffort){break;}
|
jpayne@68
|
1269 }
|
jpayne@68
|
1270 return null;
|
jpayne@68
|
1271 }
|
jpayne@68
|
1272
|
jpayne@68
|
1273 /**
|
jpayne@68
|
1274 * Parse a Unite header to a Node.
|
jpayne@68
|
1275 * @param s Unite header.
|
jpayne@68
|
1276 * @param bestEffort Try certain substrings if the name is not found.
|
jpayne@68
|
1277 * @return Node
|
jpayne@68
|
1278 */
|
jpayne@68
|
1279 TaxNode getNodeUnite(String s, boolean bestEffort){
|
jpayne@68
|
1280 if(s==null){return null;}
|
jpayne@68
|
1281 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){
|
jpayne@68
|
1282 return getNodeOldStyle(s, s.charAt(3));
|
jpayne@68
|
1283 }
|
jpayne@68
|
1284 String[] split=Tools.pipePattern.split(s);
|
jpayne@68
|
1285
|
jpayne@68
|
1286 int number=-1;
|
jpayne@68
|
1287 String name=split[0];
|
jpayne@68
|
1288 String acc=split[1];
|
jpayne@68
|
1289 if(AccessionToTaxid.LOADED() && acc.length()>0){
|
jpayne@68
|
1290 number=AccessionToTaxid.get(acc);
|
jpayne@68
|
1291 }
|
jpayne@68
|
1292 if(number<1){
|
jpayne@68
|
1293 TaxNode tn=getNodeByName(name);
|
jpayne@68
|
1294 if(tn!=null){number=tn.id;}
|
jpayne@68
|
1295 }
|
jpayne@68
|
1296
|
jpayne@68
|
1297 if(number>=0){return getNode(number);}
|
jpayne@68
|
1298 return null;
|
jpayne@68
|
1299 }
|
jpayne@68
|
1300
|
jpayne@68
|
1301 /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */
|
jpayne@68
|
1302 private TaxNode getNodeOldStyle(final String s, char delimiter){
|
jpayne@68
|
1303 {
|
jpayne@68
|
1304 int index=s.indexOf(delimiter);
|
jpayne@68
|
1305 if(index<0){
|
jpayne@68
|
1306 delimiter='~';
|
jpayne@68
|
1307 index=s.indexOf(delimiter);
|
jpayne@68
|
1308 if(index<0){
|
jpayne@68
|
1309 delimiter='_';
|
jpayne@68
|
1310 index=s.indexOf(delimiter);
|
jpayne@68
|
1311 }
|
jpayne@68
|
1312 }
|
jpayne@68
|
1313 int number=-1;
|
jpayne@68
|
1314
|
jpayne@68
|
1315 Throwable e=null;
|
jpayne@68
|
1316
|
jpayne@68
|
1317 if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){
|
jpayne@68
|
1318 // System.err.println("Parsing gi number.");
|
jpayne@68
|
1319
|
jpayne@68
|
1320 if(GiToTaxid.isInitialized()){
|
jpayne@68
|
1321 try {
|
jpayne@68
|
1322 number=GiToTaxid.parseGiToTaxid(s, delimiter);
|
jpayne@68
|
1323 } catch (Throwable e2) {
|
jpayne@68
|
1324 e=e2;
|
jpayne@68
|
1325 }
|
jpayne@68
|
1326 }else{
|
jpayne@68
|
1327 assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s;
|
jpayne@68
|
1328 }
|
jpayne@68
|
1329 // if(number!=-1){System.err.println("number="+number);}
|
jpayne@68
|
1330 }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){
|
jpayne@68
|
1331 // System.err.println("Parsing ncbi number.");
|
jpayne@68
|
1332 number=GiToTaxid.parseTaxidNumber(s, delimiter);
|
jpayne@68
|
1333 }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){
|
jpayne@68
|
1334 // System.err.println("Parsing ncbi number.");
|
jpayne@68
|
1335 long img=parseDelimitedNumber(s, delimiter);
|
jpayne@68
|
1336 ImgRecord record=imgMap.get(img);
|
jpayne@68
|
1337 number=(record==null ? -1 : record.taxID);
|
jpayne@68
|
1338 }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete
|
jpayne@68
|
1339 // System.err.println("Parsing ncbi number.");
|
jpayne@68
|
1340 number=GiToTaxid.parseTaxidNumber(s, delimiter);
|
jpayne@68
|
1341 }
|
jpayne@68
|
1342
|
jpayne@68
|
1343 if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){
|
jpayne@68
|
1344 String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s));
|
jpayne@68
|
1345 if(AccessionToTaxid.LOADED()){
|
jpayne@68
|
1346 number=parseAccessionToTaxid(split);
|
jpayne@68
|
1347 }
|
jpayne@68
|
1348 if(number<0){
|
jpayne@68
|
1349 number=parseHeaderNameToTaxid(split);
|
jpayne@68
|
1350 }
|
jpayne@68
|
1351 }
|
jpayne@68
|
1352
|
jpayne@68
|
1353 if(number<0 && e!=null){
|
jpayne@68
|
1354 assert(false) : e;
|
jpayne@68
|
1355 throw new RuntimeException(e);
|
jpayne@68
|
1356 }
|
jpayne@68
|
1357
|
jpayne@68
|
1358 //TaxServer code could go here...
|
jpayne@68
|
1359
|
jpayne@68
|
1360 if(number>=0){return getNode(number);}
|
jpayne@68
|
1361 }
|
jpayne@68
|
1362 if(verbose){System.err.println("Can't process name "+s);}
|
jpayne@68
|
1363 if(Tools.isDigit(s.charAt(0)) && s.length()<=9){
|
jpayne@68
|
1364 try {
|
jpayne@68
|
1365 return getNode(Integer.parseInt(s));
|
jpayne@68
|
1366 } catch (NumberFormatException e) {
|
jpayne@68
|
1367 //ignore
|
jpayne@68
|
1368 }
|
jpayne@68
|
1369 }
|
jpayne@68
|
1370 return null;
|
jpayne@68
|
1371 }
|
jpayne@68
|
1372
|
jpayne@68
|
1373 /** Parse a delimited number from a header, or return -1 if formatted incorrectly. */
|
jpayne@68
|
1374 static long parseDelimitedNumber(String s, char delimiter){
|
jpayne@68
|
1375 if(s==null){return -1;}
|
jpayne@68
|
1376 int i=0;
|
jpayne@68
|
1377 while(i<s.length() && s.charAt(i)!=delimiter){i++;}
|
jpayne@68
|
1378 i++;
|
jpayne@68
|
1379 if(i>=s.length() || !Tools.isDigit(s.charAt(i))){return -1;}
|
jpayne@68
|
1380
|
jpayne@68
|
1381 long number=0;
|
jpayne@68
|
1382 while(i<s.length()){
|
jpayne@68
|
1383 char c=s.charAt(i);
|
jpayne@68
|
1384 if(c==delimiter || c==' ' || c=='\t'){break;}
|
jpayne@68
|
1385 assert(Tools.isDigit(c)) : c+"\n"+s;
|
jpayne@68
|
1386 number=(number*10)+(c-'0');
|
jpayne@68
|
1387 i++;
|
jpayne@68
|
1388 }
|
jpayne@68
|
1389 return number;
|
jpayne@68
|
1390 }
|
jpayne@68
|
1391
|
jpayne@68
|
1392 /** Parses sequence headers using NCBI's current header system, with Accessions. */
|
jpayne@68
|
1393 private TaxNode getNodeNewStyle(final String s){
|
jpayne@68
|
1394
|
jpayne@68
|
1395 int space=s.indexOf(' ');
|
jpayne@68
|
1396 int number=-1;
|
jpayne@68
|
1397
|
jpayne@68
|
1398 if(AccessionToTaxid.LOADED()){
|
jpayne@68
|
1399 if(space>0){
|
jpayne@68
|
1400 number=AccessionToTaxid.get(s.substring(0, space));
|
jpayne@68
|
1401 }else{
|
jpayne@68
|
1402 number=AccessionToTaxid.get(s);
|
jpayne@68
|
1403 }
|
jpayne@68
|
1404 }
|
jpayne@68
|
1405
|
jpayne@68
|
1406 if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){
|
jpayne@68
|
1407 try {
|
jpayne@68
|
1408 return getNode(Integer.parseInt(s));
|
jpayne@68
|
1409 } catch (NumberFormatException e) {
|
jpayne@68
|
1410 //ignore
|
jpayne@68
|
1411 }
|
jpayne@68
|
1412 }
|
jpayne@68
|
1413
|
jpayne@68
|
1414 if(number<0 && space>0){
|
jpayne@68
|
1415 number=parseNameToTaxid(s.substring(space+1));
|
jpayne@68
|
1416 }
|
jpayne@68
|
1417
|
jpayne@68
|
1418 if(number>-1){return getNode(number);}
|
jpayne@68
|
1419 if(space<0 && s.indexOf('_')>0){
|
jpayne@68
|
1420 return getNodeNewStyle(s.replace('_', ' '));
|
jpayne@68
|
1421 }
|
jpayne@68
|
1422 return null;
|
jpayne@68
|
1423 }
|
jpayne@68
|
1424
|
jpayne@68
|
1425 /**
|
jpayne@68
|
1426 * For parsing old-style NCBI headers.
|
jpayne@68
|
1427 */
|
jpayne@68
|
1428 public int parseAccessionToTaxid(String[] split){
|
jpayne@68
|
1429 if(split.length<4){
|
jpayne@68
|
1430 return -1;
|
jpayne@68
|
1431 }
|
jpayne@68
|
1432 int ncbi=AccessionToTaxid.get(split[3]);
|
jpayne@68
|
1433 return ncbi;
|
jpayne@68
|
1434 }
|
jpayne@68
|
1435
|
jpayne@68
|
1436 /**
|
jpayne@68
|
1437 * For parsing old-style NCBI headers.
|
jpayne@68
|
1438 */
|
jpayne@68
|
1439 public int parseHeaderNameToTaxid(String[] split){
|
jpayne@68
|
1440 if(split.length<5){
|
jpayne@68
|
1441 return -1;
|
jpayne@68
|
1442 }
|
jpayne@68
|
1443 return parseNameToTaxid(split[4]);
|
jpayne@68
|
1444 }
|
jpayne@68
|
1445
|
jpayne@68
|
1446 /**
|
jpayne@68
|
1447 * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens").
|
jpayne@68
|
1448 * If multiple nodes share the same name, returns the first; to get the full list,
|
jpayne@68
|
1449 * use getNodesByNameExtended.
|
jpayne@68
|
1450 * @param name Organism name.
|
jpayne@68
|
1451 * @return Organism TaxID, or -1 if not found.
|
jpayne@68
|
1452 */
|
jpayne@68
|
1453 public int parseNameToTaxid(String name){
|
jpayne@68
|
1454 // assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size());
|
jpayne@68
|
1455 List<TaxNode> list=null;
|
jpayne@68
|
1456
|
jpayne@68
|
1457 list=getNodesByNameExtended(name);
|
jpayne@68
|
1458
|
jpayne@68
|
1459 if(list==null || list.size()>1){return -1;}
|
jpayne@68
|
1460 return list.get(0).id;
|
jpayne@68
|
1461 }
|
jpayne@68
|
1462
|
jpayne@68
|
1463 /**
|
jpayne@68
|
1464 * Fetch nodes indicated by this name.
|
jpayne@68
|
1465 * @param name A taxonomic name delimited by space or underscore.
|
jpayne@68
|
1466 * @return Nodes corresponding to the name.
|
jpayne@68
|
1467 */
|
jpayne@68
|
1468 public List<TaxNode> getNodesByNameExtended(String name){
|
jpayne@68
|
1469 List<TaxNode> list=null;
|
jpayne@68
|
1470
|
jpayne@68
|
1471 list=getNodesByName(name);
|
jpayne@68
|
1472 if(list!=null){return list;}
|
jpayne@68
|
1473
|
jpayne@68
|
1474 name=name.replaceAll("_", " ").trim();
|
jpayne@68
|
1475 list=getNodesByName(name);
|
jpayne@68
|
1476 if(list!=null){return list;}
|
jpayne@68
|
1477
|
jpayne@68
|
1478 String[] split2=name.split(" ");
|
jpayne@68
|
1479
|
jpayne@68
|
1480 if(split2.length>7){
|
jpayne@68
|
1481 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7];
|
jpayne@68
|
1482 list=getNodesByName(term);
|
jpayne@68
|
1483 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1484 if(list!=null){return list;}
|
jpayne@68
|
1485 }
|
jpayne@68
|
1486
|
jpayne@68
|
1487 if(split2.length>6){
|
jpayne@68
|
1488 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6];
|
jpayne@68
|
1489 list=getNodesByName(term);
|
jpayne@68
|
1490 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1491 if(list!=null){return list;}
|
jpayne@68
|
1492 }
|
jpayne@68
|
1493
|
jpayne@68
|
1494 if(split2.length>5){
|
jpayne@68
|
1495 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5];
|
jpayne@68
|
1496 list=getNodesByName(term);
|
jpayne@68
|
1497 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1498 if(list!=null){return list;}
|
jpayne@68
|
1499 }
|
jpayne@68
|
1500 if(split2.length>4){
|
jpayne@68
|
1501 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4];
|
jpayne@68
|
1502 list=getNodesByName(term);
|
jpayne@68
|
1503 // System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1504 if(list!=null){return list;}
|
jpayne@68
|
1505 }
|
jpayne@68
|
1506 if(split2.length>3){
|
jpayne@68
|
1507 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3];
|
jpayne@68
|
1508 list=getNodesByName(term);
|
jpayne@68
|
1509 // System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1510 if(list!=null){return list;}
|
jpayne@68
|
1511 }
|
jpayne@68
|
1512 if(split2.length>2){
|
jpayne@68
|
1513 String term=split2[0]+" "+split2[1]+" "+split2[2];
|
jpayne@68
|
1514 list=getNodesByName(term);
|
jpayne@68
|
1515 // System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1516 if(list!=null){return list;}
|
jpayne@68
|
1517 }
|
jpayne@68
|
1518 if(split2.length>1){
|
jpayne@68
|
1519 String term=split2[0]+" "+split2[1];
|
jpayne@68
|
1520 list=getNodesByName(term);
|
jpayne@68
|
1521 // System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1522 if(list!=null){return list;}
|
jpayne@68
|
1523 }
|
jpayne@68
|
1524 if(split2.length>0){
|
jpayne@68
|
1525 String term=split2[0];
|
jpayne@68
|
1526 list=getNodesByName(term);
|
jpayne@68
|
1527 // System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list);
|
jpayne@68
|
1528 if(list!=null){return list;}
|
jpayne@68
|
1529 }
|
jpayne@68
|
1530
|
jpayne@68
|
1531 return null;
|
jpayne@68
|
1532 }
|
jpayne@68
|
1533
|
jpayne@68
|
1534 /*--------------------------------------------------------------*/
|
jpayne@68
|
1535 /*---------------- Assorted Methods ----------------*/
|
jpayne@68
|
1536 /*--------------------------------------------------------------*/
|
jpayne@68
|
1537
|
jpayne@68
|
1538 /**
|
jpayne@68
|
1539 * Return the TaxID of the lowest ancestor node at least the specified level,
|
jpayne@68
|
1540 * including this node itself. Level is the normal (non-extended) level.
|
jpayne@68
|
1541 * @param taxID
|
jpayne@68
|
1542 * @param taxLevel
|
jpayne@68
|
1543 * @return
|
jpayne@68
|
1544 */
|
jpayne@68
|
1545 public int promote(final int taxID, int taxLevel){
|
jpayne@68
|
1546 TaxNode tn=null;
|
jpayne@68
|
1547 tn=(taxID<1 ? null : getNode(taxID));
|
jpayne@68
|
1548 tn=promote(tn, taxLevel);
|
jpayne@68
|
1549 return (tn==null ? taxID : tn.id);
|
jpayne@68
|
1550 }
|
jpayne@68
|
1551
|
jpayne@68
|
1552 /**
|
jpayne@68
|
1553 * Fetch the first node in this node's lineage of at least the indicated level.
|
jpayne@68
|
1554 * This can be the node itself or an ancestor.
|
jpayne@68
|
1555 * @see getNodeAtLevelExtended
|
jpayne@68
|
1556 * @param tn Node in question
|
jpayne@68
|
1557 * @param taxLevel Desired minimum level
|
jpayne@68
|
1558 * @return A node at the desired level
|
jpayne@68
|
1559 */
|
jpayne@68
|
1560 public TaxNode promote(TaxNode tn, int taxLevel){
|
jpayne@68
|
1561 while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){
|
jpayne@68
|
1562 TaxNode temp=getNode(tn.pid);
|
jpayne@68
|
1563 if(temp==null || temp==tn || temp.level>=TaxTree.LIFE || temp.level>taxLevel){break;}
|
jpayne@68
|
1564 tn=temp;
|
jpayne@68
|
1565 }
|
jpayne@68
|
1566 return tn;
|
jpayne@68
|
1567 }
|
jpayne@68
|
1568
|
jpayne@68
|
1569 /**
|
jpayne@68
|
1570 * Determine the TaxID of the node's parent.
|
jpayne@68
|
1571 * @param id TaxID of child node
|
jpayne@68
|
1572 * @return Parent TaxID
|
jpayne@68
|
1573 */
|
jpayne@68
|
1574 public int getParentID(int id){
|
jpayne@68
|
1575 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
|
jpayne@68
|
1576 + "\nPlease redownload it and regenerate the taxtree.";
|
jpayne@68
|
1577 if(id<0 || id>=nodes.length){return -1;}
|
jpayne@68
|
1578 TaxNode tn=nodes[id];
|
jpayne@68
|
1579 if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);}
|
jpayne@68
|
1580 return tn==null ? -1 : tn.pid;
|
jpayne@68
|
1581 }
|
jpayne@68
|
1582
|
jpayne@68
|
1583 /**
|
jpayne@68
|
1584 * Fetch the node with this TaxID.
|
jpayne@68
|
1585 * @param id TaxID
|
jpayne@68
|
1586 * @return Node
|
jpayne@68
|
1587 */
|
jpayne@68
|
1588 public TaxNode getNode(int id){
|
jpayne@68
|
1589 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
|
jpayne@68
|
1590 + "\nPlease redownload it and regenerate the taxtree.";
|
jpayne@68
|
1591 if(id<0 || id>=nodes.length){return null;}
|
jpayne@68
|
1592 TaxNode tn=nodes[id];
|
jpayne@68
|
1593 if(tn!=null || mergedMap==null){return tn;}
|
jpayne@68
|
1594 return getNode(mergedMap.get(id), true);
|
jpayne@68
|
1595 }
|
jpayne@68
|
1596
|
jpayne@68
|
1597 /**
|
jpayne@68
|
1598 * Fetch the node with this TaxID, but don't throw assertions upon failure.
|
jpayne@68
|
1599 * @param id TaxID
|
jpayne@68
|
1600 * @return Node
|
jpayne@68
|
1601 */
|
jpayne@68
|
1602 public TaxNode getNode(int id, boolean skipAssertion){
|
jpayne@68
|
1603 assert(skipAssertion || id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump."
|
jpayne@68
|
1604 + "\nPlease redownload it and regenerate the taxtree.";
|
jpayne@68
|
1605 if(id<0 || id>=nodes.length){return null;}
|
jpayne@68
|
1606 TaxNode tn=nodes[id];
|
jpayne@68
|
1607 if(tn!=null || mergedMap==null){return tn;}
|
jpayne@68
|
1608 return getNode(mergedMap.get(id), true);
|
jpayne@68
|
1609 }
|
jpayne@68
|
1610
|
jpayne@68
|
1611 public TaxNode getNodeAtLevel(int id, int minLevel){
|
jpayne@68
|
1612 return getNodeAtLevel(id, minLevel, DOMAIN);
|
jpayne@68
|
1613 }
|
jpayne@68
|
1614
|
jpayne@68
|
1615 public TaxNode getNodeAtLevelExtended(int id, int minLevelE){
|
jpayne@68
|
1616 return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E);
|
jpayne@68
|
1617 }
|
jpayne@68
|
1618
|
jpayne@68
|
1619 public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){
|
jpayne@68
|
1620 final int minLevelExtended=levelToExtended(minLevel);
|
jpayne@68
|
1621 final int maxLevelExtended=levelToExtended(maxLevel);
|
jpayne@68
|
1622 return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended);
|
jpayne@68
|
1623 }
|
jpayne@68
|
1624
|
jpayne@68
|
1625 public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){
|
jpayne@68
|
1626 TaxNode tn=getNode(id);
|
jpayne@68
|
1627 while(tn!=null && tn.pid!=tn.id && tn.levelExtended<minLevelE){
|
jpayne@68
|
1628 TaxNode temp=getNode(tn.pid);
|
jpayne@68
|
1629 if(temp==null || temp.levelExtended>maxLevelE){break;}
|
jpayne@68
|
1630 tn=temp;
|
jpayne@68
|
1631 }
|
jpayne@68
|
1632 return tn;
|
jpayne@68
|
1633 }
|
jpayne@68
|
1634
|
jpayne@68
|
1635 public int getIdAtLevelExtended(int taxID, int taxLevelExtended){
|
jpayne@68
|
1636 if(taxLevelExtended<0){return taxID;}
|
jpayne@68
|
1637 TaxNode tn=getNode(taxID);
|
jpayne@68
|
1638 while(tn!=null && tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){
|
jpayne@68
|
1639 tn=getNode(tn.pid);
|
jpayne@68
|
1640 if(tn.levelExtended>taxLevelExtended){break;}
|
jpayne@68
|
1641 taxID=tn.id;
|
jpayne@68
|
1642 }
|
jpayne@68
|
1643 return taxID;
|
jpayne@68
|
1644 }
|
jpayne@68
|
1645
|
jpayne@68
|
1646 /**
|
jpayne@68
|
1647 * Fetch the node with this name.
|
jpayne@68
|
1648 * Throw an assertion if there are multiple such nodes.
|
jpayne@68
|
1649 * @param s Organism name.
|
jpayne@68
|
1650 * @return Node with given name.
|
jpayne@68
|
1651 */
|
jpayne@68
|
1652 public TaxNode getNodeByName(String s){
|
jpayne@68
|
1653 List<TaxNode> list=getNodesByName(s, false);
|
jpayne@68
|
1654 if(list==null){list=getNodesByName(s, true);}
|
jpayne@68
|
1655 if(list==null || list.isEmpty()){return null;}
|
jpayne@68
|
1656 if(list.size()==1){return list.get(0);}
|
jpayne@68
|
1657 assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n";
|
jpayne@68
|
1658 TaxNode a=list.get(0);
|
jpayne@68
|
1659 for(int i=1; i<list.size(); i++){
|
jpayne@68
|
1660 TaxNode b=list.get(i);
|
jpayne@68
|
1661 //Keep the most specific node
|
jpayne@68
|
1662 // if(a==null || (b!=null && b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf())){//not necessary
|
jpayne@68
|
1663 if(b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf()){
|
jpayne@68
|
1664 a=b;
|
jpayne@68
|
1665 }
|
jpayne@68
|
1666 }
|
jpayne@68
|
1667 return a;
|
jpayne@68
|
1668 }
|
jpayne@68
|
1669
|
jpayne@68
|
1670 /**
|
jpayne@68
|
1671 * Fetch a list of all nodes with this name.
|
jpayne@68
|
1672 * @param s Organism name.
|
jpayne@68
|
1673 * @return Nodes with given name.
|
jpayne@68
|
1674 */
|
jpayne@68
|
1675 public List<TaxNode> getNodesByName(String s){
|
jpayne@68
|
1676 List<TaxNode> list=getNodesByName(s, false);
|
jpayne@68
|
1677 if(list==null){list=getNodesByName(s, true);}
|
jpayne@68
|
1678 return list;
|
jpayne@68
|
1679 }
|
jpayne@68
|
1680
|
jpayne@68
|
1681 /**
|
jpayne@68
|
1682 * Fetch a map of names to nodes. If absent, create it first.
|
jpayne@68
|
1683 * @param lowercase If true, return the map with lowercase keys.
|
jpayne@68
|
1684 * @return Map of names to nodes.
|
jpayne@68
|
1685 */
|
jpayne@68
|
1686 private HashMap<String, ArrayList<TaxNode>> getMap(boolean lowercase){
|
jpayne@68
|
1687 HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap);
|
jpayne@68
|
1688 if(map==null){
|
jpayne@68
|
1689 synchronized(this){hashNames(true);}
|
jpayne@68
|
1690 map=(lowercase ? nameMapLower : nameMap);
|
jpayne@68
|
1691 }
|
jpayne@68
|
1692 assert(map!=null) : "Tax names were not hashed.";
|
jpayne@68
|
1693 return map;
|
jpayne@68
|
1694 }
|
jpayne@68
|
1695
|
jpayne@68
|
1696 private List<TaxNode> getNodesByName(String s, boolean lowercase){
|
jpayne@68
|
1697 if(s==null){return null;}
|
jpayne@68
|
1698 if(s.indexOf('_')>=0){s=s.replace('_', ' ');}
|
jpayne@68
|
1699 if(lowercase){s=s.toLowerCase();}
|
jpayne@68
|
1700 // System.err.println("Searching for "+s);
|
jpayne@68
|
1701 final HashMap<String, ArrayList<TaxNode>> map=getMap(lowercase);
|
jpayne@68
|
1702 ArrayList<TaxNode> list=map.get(s);
|
jpayne@68
|
1703 if(list!=null){return list;}
|
jpayne@68
|
1704 // System.err.println("No matches for '"+s+"'");
|
jpayne@68
|
1705
|
jpayne@68
|
1706 // assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s);
|
jpayne@68
|
1707
|
jpayne@68
|
1708 if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;}
|
jpayne@68
|
1709 String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8);
|
jpayne@68
|
1710 // System.err.println("Array: "+Arrays.toString(split));
|
jpayne@68
|
1711 list=map.get(split[split.length-1]);
|
jpayne@68
|
1712 if(list==null){return list;}
|
jpayne@68
|
1713 // System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")");
|
jpayne@68
|
1714
|
jpayne@68
|
1715 int matchCount=0;
|
jpayne@68
|
1716 for(TaxNode tn : list){
|
jpayne@68
|
1717 if(tn.matchesName(split, split.length-1, this)){matchCount++;}
|
jpayne@68
|
1718 }
|
jpayne@68
|
1719 if(matchCount==list.size()){return list;}
|
jpayne@68
|
1720 if(matchCount<1){return null;}
|
jpayne@68
|
1721 ArrayList<TaxNode> hits=new ArrayList<TaxNode>(matchCount);
|
jpayne@68
|
1722 for(TaxNode tn : list){
|
jpayne@68
|
1723 if(tn.matchesName(split, split.length-1, this)){hits.add(tn);}
|
jpayne@68
|
1724 }
|
jpayne@68
|
1725 return hits;
|
jpayne@68
|
1726 }
|
jpayne@68
|
1727 public ArrayList<TaxNode> getAncestors(int id){
|
jpayne@68
|
1728 TaxNode current=getNode(id);
|
jpayne@68
|
1729 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
|
jpayne@68
|
1730 while(current!=null && current.pid!=current.id){//ignores root
|
jpayne@68
|
1731 list.add(current);
|
jpayne@68
|
1732 current=getNode(current.pid);
|
jpayne@68
|
1733 }
|
jpayne@68
|
1734 //optionally add root here
|
jpayne@68
|
1735 return list;
|
jpayne@68
|
1736 }
|
jpayne@68
|
1737
|
jpayne@68
|
1738 public void increment(IntList ids, IntList counts, boolean sync){
|
jpayne@68
|
1739
|
jpayne@68
|
1740 ids.sort();
|
jpayne@68
|
1741 ids.getUniqueCounts(counts);
|
jpayne@68
|
1742
|
jpayne@68
|
1743 if(!sync){
|
jpayne@68
|
1744 for(int i=0; i<ids.size; i++){
|
jpayne@68
|
1745 int id=ids.get(i);
|
jpayne@68
|
1746 int count=counts.get(i);
|
jpayne@68
|
1747 incrementRaw(id, count);
|
jpayne@68
|
1748 }
|
jpayne@68
|
1749 }else{
|
jpayne@68
|
1750 synchronized(this){
|
jpayne@68
|
1751 for(int i=0; i<ids.size; i++){
|
jpayne@68
|
1752 int id=ids.get(i);
|
jpayne@68
|
1753 int count=counts.get(i);
|
jpayne@68
|
1754 incrementRaw(id, count);
|
jpayne@68
|
1755 }
|
jpayne@68
|
1756 }
|
jpayne@68
|
1757 }
|
jpayne@68
|
1758 }
|
jpayne@68
|
1759
|
jpayne@68
|
1760 public void incrementRaw(int id, long amt){
|
jpayne@68
|
1761 assert(id>=0 && id<nodes.length) : "TaxID "+id+" is out of range."+(id<0 ? "" : " Possibly the taxonomy data needs to be updated.");
|
jpayne@68
|
1762 assert(nodes[id]!=null) : "No node for TaxID "+id+"; possibly the taxonomy data needs to be updated.";
|
jpayne@68
|
1763 nodes[id].incrementRaw(amt);
|
jpayne@68
|
1764 }
|
jpayne@68
|
1765
|
jpayne@68
|
1766 public void percolateUp(){
|
jpayne@68
|
1767 for(int i=0; i<treeLevelsExtended.length; i++){
|
jpayne@68
|
1768 percolateUp(i);
|
jpayne@68
|
1769 }
|
jpayne@68
|
1770 }
|
jpayne@68
|
1771
|
jpayne@68
|
1772 public void percolateUp(final int fromLevel){
|
jpayne@68
|
1773 final TaxNode[] stratum=treeLevelsExtended[fromLevel];
|
jpayne@68
|
1774 for(final TaxNode n : stratum){
|
jpayne@68
|
1775 n.incrementSum(n.countRaw);
|
jpayne@68
|
1776 TaxNode parent=nodes[n.pid];
|
jpayne@68
|
1777 if(n!=parent){
|
jpayne@68
|
1778 parent.incrementSum(n.countSum);
|
jpayne@68
|
1779 }
|
jpayne@68
|
1780 }
|
jpayne@68
|
1781 }
|
jpayne@68
|
1782
|
jpayne@68
|
1783 /** Add this amount to the node and all its ancestors. */
|
jpayne@68
|
1784 public void percolateUp(TaxNode node, long amt){
|
jpayne@68
|
1785 if(amt==0){return;}
|
jpayne@68
|
1786 if(verbose){System.err.println("percolateUp("+amt+") node: "+node);}
|
jpayne@68
|
1787 while(node.id!=node.pid){
|
jpayne@68
|
1788 node.incrementSum(amt);
|
jpayne@68
|
1789 node=nodes[node.pid];
|
jpayne@68
|
1790 }
|
jpayne@68
|
1791 node.incrementSum(amt);
|
jpayne@68
|
1792 }
|
jpayne@68
|
1793
|
jpayne@68
|
1794 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){
|
jpayne@68
|
1795 return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1);
|
jpayne@68
|
1796 }
|
jpayne@68
|
1797
|
jpayne@68
|
1798 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){
|
jpayne@68
|
1799 final int minLevelExtended=levelToExtended(minLevel);
|
jpayne@68
|
1800 final int maxLevelExtended=levelToExtended(maxLevel);
|
jpayne@68
|
1801 // assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended;
|
jpayne@68
|
1802 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
|
jpayne@68
|
1803 for(int i=minLevelExtended; i<nodesPerLevelExtended.length && i<=maxLevelExtended; i++){
|
jpayne@68
|
1804 list.addAll(gatherNodesAtLeastLimitExtended(i, limit));
|
jpayne@68
|
1805 }
|
jpayne@68
|
1806 Shared.sort(list, TaxNode.countComparator);
|
jpayne@68
|
1807 return list;
|
jpayne@68
|
1808 }
|
jpayne@68
|
1809
|
jpayne@68
|
1810 public ArrayList<TaxNode> gatherNodesAtLeastLimitExtended(final int fromLevelExtended, final long limit){
|
jpayne@68
|
1811 ArrayList<TaxNode> list=new ArrayList<TaxNode>();
|
jpayne@68
|
1812 final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended];
|
jpayne@68
|
1813 for(final TaxNode n : stratum){
|
jpayne@68
|
1814 if(n.countSum>=limit){
|
jpayne@68
|
1815 list.add(n);
|
jpayne@68
|
1816 TaxNode parent=nodes[n.pid];
|
jpayne@68
|
1817 if(n!=parent){
|
jpayne@68
|
1818 percolateUp(parent, -n.countSum);//123 This was negative for some reason
|
jpayne@68
|
1819 }
|
jpayne@68
|
1820 }
|
jpayne@68
|
1821 }
|
jpayne@68
|
1822 Shared.sort(list, TaxNode.countComparator);
|
jpayne@68
|
1823 return list;
|
jpayne@68
|
1824 }
|
jpayne@68
|
1825
|
jpayne@68
|
1826 /*--------------------------------------------------------------*/
|
jpayne@68
|
1827 /*---------------- Static Initializers ----------------*/
|
jpayne@68
|
1828 /*--------------------------------------------------------------*/
|
jpayne@68
|
1829
|
jpayne@68
|
1830 /**
|
jpayne@68
|
1831 * Generate the name to level number map.
|
jpayne@68
|
1832 */
|
jpayne@68
|
1833 private static HashMap<String, Integer> makeLevelMap() {
|
jpayne@68
|
1834 HashMap<String, Integer> map=new HashMap<String, Integer>(31);
|
jpayne@68
|
1835 for(int i=0; i<taxLevelNames.length; i++){
|
jpayne@68
|
1836 map.put(taxLevelNames[i], i);
|
jpayne@68
|
1837 map.put(taxLevelNames[i].toUpperCase(), i);
|
jpayne@68
|
1838 }
|
jpayne@68
|
1839 map.put("clade", NO_RANK);
|
jpayne@68
|
1840 map.put("clade".toUpperCase(), NO_RANK);
|
jpayne@68
|
1841 return map;
|
jpayne@68
|
1842 }
|
jpayne@68
|
1843
|
jpayne@68
|
1844 /**
|
jpayne@68
|
1845 * Generate the name to extended level number map.
|
jpayne@68
|
1846 */
|
jpayne@68
|
1847 private static HashMap<String, Integer> makeLevelMapExtended() {
|
jpayne@68
|
1848 HashMap<String, Integer> map=new HashMap<String, Integer>(129);
|
jpayne@68
|
1849 for(int i=0; i<taxLevelNamesExtended.length; i++){
|
jpayne@68
|
1850 map.put(taxLevelNamesExtended[i], i);
|
jpayne@68
|
1851 map.put(taxLevelNamesExtended[i].toUpperCase(), i);
|
jpayne@68
|
1852 }
|
jpayne@68
|
1853 map.put("clade", NO_RANK_E);
|
jpayne@68
|
1854 map.put("clade".toUpperCase(), NO_RANK_E);
|
jpayne@68
|
1855 return map;
|
jpayne@68
|
1856 }
|
jpayne@68
|
1857
|
jpayne@68
|
1858 /**
|
jpayne@68
|
1859 * I think this maps normal and extend names to normal level numbers.
|
jpayne@68
|
1860 */
|
jpayne@68
|
1861 private static HashMap<String, Integer> makeAltLevelMap() {
|
jpayne@68
|
1862 HashMap<String, Integer> map=new HashMap<String, Integer>(129);
|
jpayne@68
|
1863 for(int i=0; i<taxLevelNames.length; i++){
|
jpayne@68
|
1864 map.put(taxLevelNames[i], i);
|
jpayne@68
|
1865 map.put(taxLevelNames[i].toUpperCase(), i);
|
jpayne@68
|
1866 }
|
jpayne@68
|
1867 map.put("clade", NO_RANK);
|
jpayne@68
|
1868 map.put("clade".toUpperCase(), NO_RANK);
|
jpayne@68
|
1869
|
jpayne@68
|
1870 //Add synonyms
|
jpayne@68
|
1871 // map.put("subfamily", map.get("family"));
|
jpayne@68
|
1872 // map.put("tribe", map.get("family"));
|
jpayne@68
|
1873 // map.put("varietas", map.get("subspecies"));
|
jpayne@68
|
1874 // map.put("subgenus", map.get("genus"));
|
jpayne@68
|
1875 // map.put("forma", map.get("subspecies"));
|
jpayne@68
|
1876 // map.put("species group", map.get("genus"));
|
jpayne@68
|
1877 // map.put("species subgroup", map.get("genus"));
|
jpayne@68
|
1878 // map.put("cohort", map.get("class"));
|
jpayne@68
|
1879 // map.put("subclass", map.get("class"));
|
jpayne@68
|
1880 // map.put("infraorder", map.get("order"));
|
jpayne@68
|
1881 // map.put("superorder", map.get("class"));
|
jpayne@68
|
1882 // map.put("subphylum", map.get("phylum"));
|
jpayne@68
|
1883 // map.put("infraclass", map.get("class"));
|
jpayne@68
|
1884 // map.put("superkingdom", map.get("division"));
|
jpayne@68
|
1885 // map.put("parvorder", map.get("order"));
|
jpayne@68
|
1886 // map.put("superclass", map.get("phylum"));
|
jpayne@68
|
1887 // map.put("superphylum", map.get("kingdom"));
|
jpayne@68
|
1888 // map.put("subkingdom", map.get("kingdom"));
|
jpayne@68
|
1889 // map.put("superfamily", map.get("order"));
|
jpayne@68
|
1890 // map.put("superkingdom", map.get("domain"));
|
jpayne@68
|
1891 // map.put("suborder", map.get("order"));
|
jpayne@68
|
1892 // map.put("subtribe", map.get("family"));
|
jpayne@68
|
1893
|
jpayne@68
|
1894 for(String[] array : taxLevelNamesExtendedMatrix){
|
jpayne@68
|
1895 String head=array[array.length-1];
|
jpayne@68
|
1896 Integer value=map.get(head);
|
jpayne@68
|
1897 assert(value!=null) : head;
|
jpayne@68
|
1898 for(String key : array){
|
jpayne@68
|
1899 if(key!=head){
|
jpayne@68
|
1900 assert(!map.containsKey(key)) : "Map already contains key "+key+": "+Arrays.toString(array);
|
jpayne@68
|
1901 map.put(key, value);
|
jpayne@68
|
1902 map.put(key.toUpperCase(), value);
|
jpayne@68
|
1903 }
|
jpayne@68
|
1904 }
|
jpayne@68
|
1905 }
|
jpayne@68
|
1906
|
jpayne@68
|
1907 return map;
|
jpayne@68
|
1908 }
|
jpayne@68
|
1909
|
jpayne@68
|
1910 /*--------------------------------------------------------------*/
|
jpayne@68
|
1911 /*---------------- Size ----------------*/
|
jpayne@68
|
1912 /*--------------------------------------------------------------*/
|
jpayne@68
|
1913
|
jpayne@68
|
1914 /** Number of bp associated with this node in RefSeq */
|
jpayne@68
|
1915 public long toSize(TaxNode tn){
|
jpayne@68
|
1916 if(tn==null){return 0;}
|
jpayne@68
|
1917 if(refseqSizeMap==null){return -1L;}
|
jpayne@68
|
1918 final long x=refseqSizeMap.get(tn.id);
|
jpayne@68
|
1919 return Tools.max(0, x);
|
jpayne@68
|
1920 }
|
jpayne@68
|
1921
|
jpayne@68
|
1922 /** Number of bp associated with this node and descendants in RefSeq */
|
jpayne@68
|
1923 public long toSizeC(TaxNode tn){
|
jpayne@68
|
1924 if(tn==null){return 0;}
|
jpayne@68
|
1925 if(refseqSizeMapC==null){return -1L;}
|
jpayne@68
|
1926 final long x=refseqSizeMapC.get(tn.id);
|
jpayne@68
|
1927 return Tools.max(0, x);
|
jpayne@68
|
1928 }
|
jpayne@68
|
1929
|
jpayne@68
|
1930 /** Number of sequences associated with this node in RefSeq */
|
jpayne@68
|
1931 public int toSeqs(TaxNode tn){
|
jpayne@68
|
1932 if(tn==null){return 0;}
|
jpayne@68
|
1933 if(refseqSeqMap==null){return -1;}
|
jpayne@68
|
1934 final int x=refseqSeqMap.get(tn.id);
|
jpayne@68
|
1935 return Tools.max(0, x);
|
jpayne@68
|
1936 }
|
jpayne@68
|
1937
|
jpayne@68
|
1938 /** Number of sequences associated with this node and descandants in RefSeq */
|
jpayne@68
|
1939 public long toSeqsC(TaxNode tn){
|
jpayne@68
|
1940 if(tn==null){return 0;}
|
jpayne@68
|
1941 if(refseqSeqMapC==null){return -1L;}
|
jpayne@68
|
1942 final long x=refseqSeqMapC.get(tn.id);
|
jpayne@68
|
1943 return Tools.max(0, x);
|
jpayne@68
|
1944 }
|
jpayne@68
|
1945
|
jpayne@68
|
1946 /** Number of descendants of this node */
|
jpayne@68
|
1947 public int toNodes(TaxNode tn){
|
jpayne@68
|
1948 if(tn==null){return 0;}
|
jpayne@68
|
1949 if(nodeMapC==null){return -1;}
|
jpayne@68
|
1950 final int x=nodeMapC.get(tn.id);
|
jpayne@68
|
1951 return Tools.max(0, x);
|
jpayne@68
|
1952 }
|
jpayne@68
|
1953
|
jpayne@68
|
1954 /**
|
jpayne@68
|
1955 * Fills refseqSizeMap, refseqSizeMapC, etc. from a file containing the summary.
|
jpayne@68
|
1956 * @param fname Size file name
|
jpayne@68
|
1957 */
|
jpayne@68
|
1958 public void loadSizeFile(String fname){
|
jpayne@68
|
1959 if(fname==null){return;}
|
jpayne@68
|
1960 assert(refseqSizeMap==null);
|
jpayne@68
|
1961 refseqSizeMap=new IntLongHashMap();
|
jpayne@68
|
1962 refseqSizeMapC=new IntLongHashMap();
|
jpayne@68
|
1963 refseqSeqMap=new IntHashMap();
|
jpayne@68
|
1964 refseqSeqMapC=new IntLongHashMap();
|
jpayne@68
|
1965 nodeMapC=new IntHashMap();
|
jpayne@68
|
1966
|
jpayne@68
|
1967 final ByteFile bf=ByteFile.makeByteFile(fname, true);
|
jpayne@68
|
1968 final byte delimiter='\t';
|
jpayne@68
|
1969 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
|
jpayne@68
|
1970 if(line.length>0 && line[0]!='#'){
|
jpayne@68
|
1971 int a=0, b=0;
|
jpayne@68
|
1972
|
jpayne@68
|
1973 while(b<line.length && line[b]!=delimiter){b++;}
|
jpayne@68
|
1974 assert(b>a) : "Missing field 0: "+new String(line);
|
jpayne@68
|
1975 int tid=Parse.parseInt(line, a, b);
|
jpayne@68
|
1976 b++;
|
jpayne@68
|
1977 a=b;
|
jpayne@68
|
1978
|
jpayne@68
|
1979 while(b<line.length && line[b]!=delimiter){b++;}
|
jpayne@68
|
1980 assert(b>a) : "Missing field 1: "+new String(line);
|
jpayne@68
|
1981 long size=Parse.parseLong(line, a, b);
|
jpayne@68
|
1982 b++;
|
jpayne@68
|
1983 a=b;
|
jpayne@68
|
1984
|
jpayne@68
|
1985 while(b<line.length && line[b]!=delimiter){b++;}
|
jpayne@68
|
1986 assert(b>a) : "Missing field 2: "+new String(line);
|
jpayne@68
|
1987 long csize=Parse.parseLong(line, a, b);
|
jpayne@68
|
1988 b++;
|
jpayne@68
|
1989 a=b;
|
jpayne@68
|
1990
|
jpayne@68
|
1991 while(b<line.length && line[b]!=delimiter){b++;}
|
jpayne@68
|
1992 assert(b>a) : "Missing field 3: "+new String(line);
|
jpayne@68
|
1993 int seqs=Parse.parseInt(line, a, b);
|
jpayne@68
|
1994 b++;
|
jpayne@68
|
1995 a=b;
|
jpayne@68
|
1996
|
jpayne@68
|
1997 while(b<line.length && line[b]!=delimiter){b++;}
|
jpayne@68
|
1998 assert(b>a) : "Missing field 4: "+new String(line);
|
jpayne@68
|
1999 long cseqs=Parse.parseLong(line, a, b);
|
jpayne@68
|
2000 b++;
|
jpayne@68
|
2001 a=b;
|
jpayne@68
|
2002
|
jpayne@68
|
2003 while(b<line.length && line[b]!=delimiter){b++;}
|
jpayne@68
|
2004 assert(b>a) : "Missing field 5: "+new String(line);
|
jpayne@68
|
2005 int cnodes=Parse.parseInt(line, a, b);
|
jpayne@68
|
2006 b++;
|
jpayne@68
|
2007 a=b;
|
jpayne@68
|
2008
|
jpayne@68
|
2009 if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);}
|
jpayne@68
|
2010 if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);}
|
jpayne@68
|
2011 if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);}
|
jpayne@68
|
2012 if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);}
|
jpayne@68
|
2013 if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);}
|
jpayne@68
|
2014 }
|
jpayne@68
|
2015 }
|
jpayne@68
|
2016 bf.close();
|
jpayne@68
|
2017 }
|
jpayne@68
|
2018
|
jpayne@68
|
2019 /*--------------------------------------------------------------*/
|
jpayne@68
|
2020 /*---------------- IMG ----------------*/
|
jpayne@68
|
2021 /*--------------------------------------------------------------*/
|
jpayne@68
|
2022
|
jpayne@68
|
2023 public static int imgToTaxid(long img){
|
jpayne@68
|
2024 ImgRecord ir=imgMap.get(img);
|
jpayne@68
|
2025 // assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n";
|
jpayne@68
|
2026 return ir==null ? -1 : ir.taxID;
|
jpayne@68
|
2027 }
|
jpayne@68
|
2028
|
jpayne@68
|
2029 public TaxNode imgToTaxNode(long img){
|
jpayne@68
|
2030 int tid=imgToTaxid(img);
|
jpayne@68
|
2031 return tid<1 ? null : getNode(tid);
|
jpayne@68
|
2032 }
|
jpayne@68
|
2033
|
jpayne@68
|
2034 // public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){
|
jpayne@68
|
2035 // assert(imgMap==null);
|
jpayne@68
|
2036 // if(fname==null){return 0;}
|
jpayne@68
|
2037 // ImgRecord2.storeName=storeName;
|
jpayne@68
|
2038 // if(outstream!=null){System.err.println("Loading IMG.");}
|
jpayne@68
|
2039 // Timer t=new Timer(outstream, false);
|
jpayne@68
|
2040 // ImgRecord2[] array=ImgRecord2.toArray(fname);
|
jpayne@68
|
2041 // int x=loadIMG(array);
|
jpayne@68
|
2042 // t.stopAndPrint();
|
jpayne@68
|
2043 // return x;
|
jpayne@68
|
2044 // }
|
jpayne@68
|
2045
|
jpayne@68
|
2046 public static int loadIMG(String fname, boolean storeName, PrintStream outstream){
|
jpayne@68
|
2047 assert(imgMap==null);
|
jpayne@68
|
2048 if(fname==null){return 0;}
|
jpayne@68
|
2049 ImgRecord.storeName=storeName;
|
jpayne@68
|
2050 if(outstream!=null){System.err.println("Loading IMG.");}
|
jpayne@68
|
2051 Timer t=new Timer(outstream, false);
|
jpayne@68
|
2052 ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ);
|
jpayne@68
|
2053 int x=loadIMG(array);
|
jpayne@68
|
2054 t.stopAndPrint();
|
jpayne@68
|
2055 return x;
|
jpayne@68
|
2056 }
|
jpayne@68
|
2057
|
jpayne@68
|
2058 public static int loadIMG(ImgRecord[] array){
|
jpayne@68
|
2059 assert(imgMap==null);
|
jpayne@68
|
2060 imgMap=new HashMap<Long, ImgRecord>((int)(array.length*1.5));
|
jpayne@68
|
2061 for(ImgRecord record : array){
|
jpayne@68
|
2062 imgMap.put(record.imgID, record);
|
jpayne@68
|
2063 }
|
jpayne@68
|
2064 return imgMap.size();
|
jpayne@68
|
2065 }
|
jpayne@68
|
2066
|
jpayne@68
|
2067 @Deprecated
|
jpayne@68
|
2068 public static int parseLevel(String b){
|
jpayne@68
|
2069 final int level;
|
jpayne@68
|
2070 if(b==null){level=-1;}
|
jpayne@68
|
2071 else if(Tools.isNumeric(b.charAt(0))){
|
jpayne@68
|
2072 level=Integer.parseInt(b);
|
jpayne@68
|
2073 }else{
|
jpayne@68
|
2074 level=stringToLevel(b.toLowerCase());
|
jpayne@68
|
2075 }
|
jpayne@68
|
2076 return level;
|
jpayne@68
|
2077 }
|
jpayne@68
|
2078
|
jpayne@68
|
2079 public static int parseLevelExtended(String b){
|
jpayne@68
|
2080 final int level;
|
jpayne@68
|
2081 if(b==null){level=-1;}
|
jpayne@68
|
2082 else if(Tools.isNumeric(b.charAt(0))){
|
jpayne@68
|
2083 level=levelToExtended(Integer.parseInt(b));
|
jpayne@68
|
2084 }else{
|
jpayne@68
|
2085 level=stringToLevelExtended(b.toLowerCase());
|
jpayne@68
|
2086 }
|
jpayne@68
|
2087 return level;
|
jpayne@68
|
2088 }
|
jpayne@68
|
2089
|
jpayne@68
|
2090 public boolean isUnclassified(int tid){
|
jpayne@68
|
2091 TaxNode tn=getNode(tid);
|
jpayne@68
|
2092 while(tn!=null && tn.id!=tn.pid){
|
jpayne@68
|
2093 if(tn.isUnclassified()){return true;}
|
jpayne@68
|
2094 if(tn.pid==tn.id){break;}
|
jpayne@68
|
2095 tn=getNode(tn.pid);
|
jpayne@68
|
2096 }
|
jpayne@68
|
2097 return false;
|
jpayne@68
|
2098 }
|
jpayne@68
|
2099
|
jpayne@68
|
2100 public boolean isEnvironmentalSample(int tid){
|
jpayne@68
|
2101 TaxNode tn=getNode(tid);
|
jpayne@68
|
2102 while(tn!=null && tn.id!=tn.pid){
|
jpayne@68
|
2103 if(tn.isEnvironmentalSample()){return true;}
|
jpayne@68
|
2104 if(tn.pid==tn.id){break;}
|
jpayne@68
|
2105 tn=getNode(tn.pid);
|
jpayne@68
|
2106 }
|
jpayne@68
|
2107 return false;
|
jpayne@68
|
2108 }
|
jpayne@68
|
2109
|
jpayne@68
|
2110 public boolean isVirus(int tid){
|
jpayne@68
|
2111 TaxNode tn=getNode(tid);
|
jpayne@68
|
2112 while(tn!=null && tn.id!=tn.pid){
|
jpayne@68
|
2113 if(tn.id==VIRUSES_ID){return true;}
|
jpayne@68
|
2114 if(tn.pid==tn.id){break;}
|
jpayne@68
|
2115 tn=getNode(tn.pid);
|
jpayne@68
|
2116 }
|
jpayne@68
|
2117 return false;
|
jpayne@68
|
2118 }
|
jpayne@68
|
2119
|
jpayne@68
|
2120 public long definedLevels(int tid){
|
jpayne@68
|
2121 long levels=0;
|
jpayne@68
|
2122 TaxNode tn=getNode(tid);
|
jpayne@68
|
2123 while(tn!=null && tn.id!=tn.pid){
|
jpayne@68
|
2124 levels=levels|(1L<<tn.level);
|
jpayne@68
|
2125 }
|
jpayne@68
|
2126 return levels;
|
jpayne@68
|
2127 }
|
jpayne@68
|
2128
|
jpayne@68
|
2129 public long definedLevelsExtended(int tid){
|
jpayne@68
|
2130 long levels=0;
|
jpayne@68
|
2131 TaxNode tn=getNode(tid);
|
jpayne@68
|
2132 while(tn!=null && tn.id!=tn.pid){
|
jpayne@68
|
2133 levels=levels|(1L<<tn.levelExtended);
|
jpayne@68
|
2134 }
|
jpayne@68
|
2135 return levels;
|
jpayne@68
|
2136 }
|
jpayne@68
|
2137
|
jpayne@68
|
2138 /**
|
jpayne@68
|
2139 * Generates the mononomial name for this taxonomic level based on the scientific name.
|
jpayne@68
|
2140 * For example, "Homo sapiens" -> "Sapiens"
|
jpayne@68
|
2141 * @param tid TaxID
|
jpayne@68
|
2142 * @return Correct name for this node.
|
jpayne@68
|
2143 */
|
jpayne@68
|
2144 public String mononomial(int tid){return mononomial(getNode(tid));}
|
jpayne@68
|
2145 public String mononomial(TaxNode tn){
|
jpayne@68
|
2146 if(tn==null){return null;}
|
jpayne@68
|
2147 String name=tn.name;
|
jpayne@68
|
2148 if(name.indexOf(' ')<0){return name;}
|
jpayne@68
|
2149 TaxNode parent=getNode(tn.pid);
|
jpayne@68
|
2150 if(parent==null){return name;}
|
jpayne@68
|
2151 String pname=parent.name;
|
jpayne@68
|
2152 if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){
|
jpayne@68
|
2153 name=name.substring(pname.length()+1);
|
jpayne@68
|
2154 }
|
jpayne@68
|
2155 return name;
|
jpayne@68
|
2156 }
|
jpayne@68
|
2157
|
jpayne@68
|
2158 /*--------------------------------------------------------------*/
|
jpayne@68
|
2159 /*---------------- Fields ----------------*/
|
jpayne@68
|
2160 /*--------------------------------------------------------------*/
|
jpayne@68
|
2161
|
jpayne@68
|
2162 /** All nodes in the tree in a flat array, indexed by TaxiD */
|
jpayne@68
|
2163 public final TaxNode[] nodes;
|
jpayne@68
|
2164
|
jpayne@68
|
2165 /** Number of nodes per normal level */
|
jpayne@68
|
2166 public final int[] nodesPerLevel=new int[taxLevelNames.length];
|
jpayne@68
|
2167
|
jpayne@68
|
2168 /** Number of nodes per extended level */
|
jpayne@68
|
2169 public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length];
|
jpayne@68
|
2170
|
jpayne@68
|
2171 /** Number of nodes in the tree */
|
jpayne@68
|
2172 public final int nodeCount;
|
jpayne@68
|
2173
|
jpayne@68
|
2174 /** Maps old TaxIDs to new TaxIDs */
|
jpayne@68
|
2175 public final IntHashMap mergedMap;
|
jpayne@68
|
2176
|
jpayne@68
|
2177 /** Arrays of all nodes at a given taxonomic level (extended) */
|
jpayne@68
|
2178 public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][];
|
jpayne@68
|
2179
|
jpayne@68
|
2180 /** Map of names to nodes */
|
jpayne@68
|
2181 HashMap<String, ArrayList<TaxNode>> nameMap;
|
jpayne@68
|
2182 /** Map of lowercase names to nodes */
|
jpayne@68
|
2183 HashMap<String, ArrayList<TaxNode>> nameMapLower;
|
jpayne@68
|
2184 /** Map of nodes to child nodes */
|
jpayne@68
|
2185 HashMap<TaxNode, ArrayList<TaxNode>> childMap;
|
jpayne@68
|
2186 public HashMap<String, ArrayList<TaxNode>> nameMap(){return nameMap;}
|
jpayne@68
|
2187
|
jpayne@68
|
2188 @Deprecated
|
jpayne@68
|
2189 public int minValidTaxa=0; //TODO: Remove (will break serialization)
|
jpayne@68
|
2190
|
jpayne@68
|
2191 /** Infer ranks for no-rank nodes, when possible */
|
jpayne@68
|
2192 public boolean simplify=true;
|
jpayne@68
|
2193 /** See simplify() for details, works in conjunction with simplify */
|
jpayne@68
|
2194 public boolean reassign=true;
|
jpayne@68
|
2195 /** Discard no-rank nodes */
|
jpayne@68
|
2196 public boolean skipNorank=false;
|
jpayne@68
|
2197 public int inferRankLimit=0;//levelMap.get("species");
|
jpayne@68
|
2198
|
jpayne@68
|
2199 //Node Statistics
|
jpayne@68
|
2200 /** Number of bases assigned to this TaxID in RefSeq */
|
jpayne@68
|
2201 private IntLongHashMap refseqSizeMap;
|
jpayne@68
|
2202 /** Number of bases assigned to this TaxID and descendants in RefSeq */
|
jpayne@68
|
2203 private IntLongHashMap refseqSizeMapC;
|
jpayne@68
|
2204 /** Number of sequences assigned to this TaxID in RefSeq */
|
jpayne@68
|
2205 private IntHashMap refseqSeqMap;
|
jpayne@68
|
2206 /** Number of sequences assigned to this TaxID and descendants in RefSeq */
|
jpayne@68
|
2207 private IntLongHashMap refseqSeqMapC;
|
jpayne@68
|
2208 /** Number of descendant nodes, inclusive, for each TaxID */
|
jpayne@68
|
2209 private IntHashMap nodeMapC;
|
jpayne@68
|
2210
|
jpayne@68
|
2211 /*--------------------------------------------------------------*/
|
jpayne@68
|
2212 /*---------------- Statics ----------------*/
|
jpayne@68
|
2213 /*--------------------------------------------------------------*/
|
jpayne@68
|
2214
|
jpayne@68
|
2215 /** Assign levels to unranked nodes below species level, when possible */
|
jpayne@68
|
2216 public static boolean assignStrains=true;
|
jpayne@68
|
2217 /** Assume headers are in Silva format */
|
jpayne@68
|
2218 public static boolean SILVA_MODE=false;
|
jpayne@68
|
2219 /** Assume headers are in Unite format */
|
jpayne@68
|
2220 public static boolean UNITE_MODE=false;
|
jpayne@68
|
2221 /** Probably unnecessary at this point... present for legacy reasons */
|
jpayne@68
|
2222 public static boolean CRASH_IF_NO_GI_TABLE=true;
|
jpayne@68
|
2223
|
jpayne@68
|
2224 public static boolean verbose=false;
|
jpayne@68
|
2225 public static boolean SHOW_WARNINGS=false;
|
jpayne@68
|
2226
|
jpayne@68
|
2227 /** Maps IMG IDs to records from the dump file */
|
jpayne@68
|
2228 private static HashMap<Long, ImgRecord> imgMap;
|
jpayne@68
|
2229
|
jpayne@68
|
2230 /** Set to false if the tree is expected to be mutated.
|
jpayne@68
|
2231 * @TODO Remove mutable fields from the tree (like counters).
|
jpayne@68
|
2232 */
|
jpayne@68
|
2233 public static boolean ALLOW_SHARED_TREE=true;
|
jpayne@68
|
2234
|
jpayne@68
|
2235 /** Universal location of the shared TaxTree used by various classes */
|
jpayne@68
|
2236 private static TaxTree sharedTree;
|
jpayne@68
|
2237
|
jpayne@68
|
2238 /** A simpler and probably less safe version of sharedTree(...) */
|
jpayne@68
|
2239 public static TaxTree getTree(){return sharedTree;}
|
jpayne@68
|
2240
|
jpayne@68
|
2241 /**
|
jpayne@68
|
2242 * Fetch the shared tree, loading it from file if not present.
|
jpayne@68
|
2243 * @return A tree.
|
jpayne@68
|
2244 * @TODO: Check proper-construction of double-checked synchronize
|
jpayne@68
|
2245 */
|
jpayne@68
|
2246 private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) {
|
jpayne@68
|
2247 if(!ALLOW_SHARED_TREE){return null;}
|
jpayne@68
|
2248 if(sharedTree==null && fname!=null){
|
jpayne@68
|
2249 if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();}
|
jpayne@68
|
2250 synchronized(TaxTree.class){
|
jpayne@68
|
2251 if(sharedTree==null){
|
jpayne@68
|
2252 if(outstream!=null){outstream.println("Loading tax tree.");}
|
jpayne@68
|
2253 Timer t=new Timer(outstream, false);
|
jpayne@68
|
2254 setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat);
|
jpayne@68
|
2255 t.stopAndPrint();
|
jpayne@68
|
2256 }
|
jpayne@68
|
2257 }
|
jpayne@68
|
2258 }
|
jpayne@68
|
2259 if(hashNames && sharedTree.nameMap==null){
|
jpayne@68
|
2260 synchronized(sharedTree){
|
jpayne@68
|
2261 if(sharedTree.nameMap==null){
|
jpayne@68
|
2262 if(outstream!=null){outstream.println("Hashing names.");}
|
jpayne@68
|
2263 Timer t=new Timer(outstream, false);
|
jpayne@68
|
2264 sharedTree.hashNames(hashDotFormat);
|
jpayne@68
|
2265 t.stopAndPrint();
|
jpayne@68
|
2266 }
|
jpayne@68
|
2267 }
|
jpayne@68
|
2268 }
|
jpayne@68
|
2269 return sharedTree;
|
jpayne@68
|
2270 }
|
jpayne@68
|
2271
|
jpayne@68
|
2272 /**
|
jpayne@68
|
2273 * For initialization. Normally only one tree is needed by a process so it is set here.
|
jpayne@68
|
2274 * If the tree is already set nothing will happen, unless additional hashing is needed.
|
jpayne@68
|
2275 */
|
jpayne@68
|
2276 private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){
|
jpayne@68
|
2277 assert(ALLOW_SHARED_TREE);
|
jpayne@68
|
2278 assert(sharedTree==null);
|
jpayne@68
|
2279 sharedTree=tree;
|
jpayne@68
|
2280 if(hashNames && sharedTree.nameMap==null){
|
jpayne@68
|
2281 synchronized(sharedTree){
|
jpayne@68
|
2282 if(sharedTree.nameMap==null){
|
jpayne@68
|
2283 sharedTree.hashNames(hashDotFormat);
|
jpayne@68
|
2284 }
|
jpayne@68
|
2285 }
|
jpayne@68
|
2286 }
|
jpayne@68
|
2287 }
|
jpayne@68
|
2288
|
jpayne@68
|
2289 /**
|
jpayne@68
|
2290 * Determine whether a taxonomic level is standard. e.g.:<br>
|
jpayne@68
|
2291 * isSimple("phylum")=true<br>
|
jpayne@68
|
2292 * isSimple("subphylum")=false<br>
|
jpayne@68
|
2293 * isSimple("no-rank")=false
|
jpayne@68
|
2294 * @param levelExtended The extended level to test.
|
jpayne@68
|
2295 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
|
jpayne@68
|
2296 */
|
jpayne@68
|
2297 public static boolean isSimple(int levelExtended){
|
jpayne@68
|
2298 int level=extendedToLevel(levelExtended);
|
jpayne@68
|
2299 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level));
|
jpayne@68
|
2300 }
|
jpayne@68
|
2301
|
jpayne@68
|
2302 /**
|
jpayne@68
|
2303 * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br>
|
jpayne@68
|
2304 * isSimple("phylum")=true<br>
|
jpayne@68
|
2305 * isSimple("substrain")=true<br>
|
jpayne@68
|
2306 * isSimple("subphylum")=false<br>
|
jpayne@68
|
2307 * isSimple("no-rank")=false
|
jpayne@68
|
2308 * @param levelExtended The extended level to test.
|
jpayne@68
|
2309 * @return True if this level is not no-rank, and the names of the normal and extended levels match.
|
jpayne@68
|
2310 */
|
jpayne@68
|
2311 public static boolean isSimple2(int levelExtended){
|
jpayne@68
|
2312 int level=extendedToLevel(levelExtended);
|
jpayne@68
|
2313 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)
|
jpayne@68
|
2314 || levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E);
|
jpayne@68
|
2315 }
|
jpayne@68
|
2316
|
jpayne@68
|
2317 /*--------------------------------------------------------------*/
|
jpayne@68
|
2318 /*---------------- Constants ----------------*/
|
jpayne@68
|
2319 /*--------------------------------------------------------------*/
|
jpayne@68
|
2320
|
jpayne@68
|
2321 /** Get the number for the normal level of this name */
|
jpayne@68
|
2322 public static final int stringToLevel(String s){return altLevelMap.get(s);}
|
jpayne@68
|
2323 public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);}
|
jpayne@68
|
2324 /** Get the number for the extended level of this name */
|
jpayne@68
|
2325 public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);}
|
jpayne@68
|
2326 /** Get the normal name for this normal level */
|
jpayne@68
|
2327 public static final String levelToString(int x){return taxLevelNames[x];}
|
jpayne@68
|
2328 /** Get the extended name for this extended level */
|
jpayne@68
|
2329 public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];}
|
jpayne@68
|
2330 /** Get the abbreviated name for this normal level */
|
jpayne@68
|
2331 public static final String levelToStringShort(int x){return taxLevelNamesShort[x];}
|
jpayne@68
|
2332
|
jpayne@68
|
2333 /** Normal, aka canonical, aka simple tax level names */
|
jpayne@68
|
2334 private static final String[] taxLevelNames=new String[] {
|
jpayne@68
|
2335 "no rank", "subspecies", "species", "genus",
|
jpayne@68
|
2336 "family", "order", "class", "phylum",
|
jpayne@68
|
2337 "kingdom", "superkingdom", "domain", "life"
|
jpayne@68
|
2338 };
|
jpayne@68
|
2339 public static final int numTaxLevelNames=taxLevelNames.length;
|
jpayne@68
|
2340
|
jpayne@68
|
2341 /**
|
jpayne@68
|
2342 * Definitive representation of all NCBI taxonomic level names.
|
jpayne@68
|
2343 * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash.
|
jpayne@68
|
2344 * The first dimension maps normal ranks to extended ranks.
|
jpayne@68
|
2345 * Both dimensions are ordered ascending.
|
jpayne@68
|
2346 * @TODO Note! If this goes over 63 names it will cause a problem with getDefinedLevels().
|
jpayne@68
|
2347 */
|
jpayne@68
|
2348 //TODO See @TODO
|
jpayne@68
|
2349 private static final String[][] taxLevelNamesExtendedMatrix=new String[][] {
|
jpayne@68
|
2350 {"no rank"},
|
jpayne@68
|
2351 {"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup",
|
jpayne@68
|
2352 "biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas",
|
jpayne@68
|
2353 "subspecies"},
|
jpayne@68
|
2354 {"species"},
|
jpayne@68
|
2355 {"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"},
|
jpayne@68
|
2356 {"subtribe", "tribe", "subfamily", "family"},
|
jpayne@68
|
2357 {"superfamily", "parvorder", "infraorder", "suborder", "order"},
|
jpayne@68
|
2358 {"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"},
|
jpayne@68
|
2359 {"superclass", "subdivision", "division", "subphylum", "phylum"},
|
jpayne@68
|
2360 {"superphylum", "subkingdom", "kingdom"},
|
jpayne@68
|
2361 {"superkingdom"},
|
jpayne@68
|
2362 {"domain"},
|
jpayne@68
|
2363 {"life"}
|
jpayne@68
|
2364 };
|
jpayne@68
|
2365
|
jpayne@68
|
2366 /** Extended tax level names as a 1D array */
|
jpayne@68
|
2367 private static final String[] taxLevelNamesExtended=makeNamesExtended();
|
jpayne@68
|
2368 /** Number of extended tax levels */
|
jpayne@68
|
2369 public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length;
|
jpayne@68
|
2370
|
jpayne@68
|
2371 /** Flatten the extended tax level names matrix to a 1D array */
|
jpayne@68
|
2372 private static final String[] makeNamesExtended(){
|
jpayne@68
|
2373 ArrayList<String> list=new ArrayList<String>();
|
jpayne@68
|
2374 for(String[] s : taxLevelNamesExtendedMatrix){
|
jpayne@68
|
2375 for(String ss : s){
|
jpayne@68
|
2376 list.add(ss);
|
jpayne@68
|
2377 }
|
jpayne@68
|
2378 }
|
jpayne@68
|
2379 return list.toArray(new String[0]);
|
jpayne@68
|
2380 }
|
jpayne@68
|
2381
|
jpayne@68
|
2382 /** Abbreviations of tax level names, mainly for semicolon form */
|
jpayne@68
|
2383 private static final String[] taxLevelNamesShort=new String[] {
|
jpayne@68
|
2384 "nr", "ss", "s", "g",
|
jpayne@68
|
2385 "f", "o", "c", "p",
|
jpayne@68
|
2386 "k", "sk", "d", "l"
|
jpayne@68
|
2387 };
|
jpayne@68
|
2388
|
jpayne@68
|
2389 /** Normal tax level numbers as constants */
|
jpayne@68
|
2390 public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3,
|
jpayne@68
|
2391 FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11;
|
jpayne@68
|
2392
|
jpayne@68
|
2393 /** TaxID of Life node */
|
jpayne@68
|
2394 public static final int LIFE_ID=1;
|
jpayne@68
|
2395 /** TaxID of Cellular Organisms node */
|
jpayne@68
|
2396 public static final int CELLULAR_ORGANISMS_ID=131567;
|
jpayne@68
|
2397 /** TaxID of Bacteria node */
|
jpayne@68
|
2398 public static final int BACTERIA_ID=2; //Is this safe? Who knows...
|
jpayne@68
|
2399 /** TaxID of Archaea node */
|
jpayne@68
|
2400 public static final int ARCHAEA_ID=2157;
|
jpayne@68
|
2401 /** TaxID of Euk node */
|
jpayne@68
|
2402 public static final int EUKARYOTA_ID=2759;
|
jpayne@68
|
2403 /** TaxID of Animal node */
|
jpayne@68
|
2404 public static final int METAZOA_ID=33208, ANIMALIA_ID=33208;
|
jpayne@68
|
2405 /** TaxID of Plant node */
|
jpayne@68
|
2406 public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090;
|
jpayne@68
|
2407 /** TaxID of Fungi node */
|
jpayne@68
|
2408 public static final int FUNGI_ID=4751;
|
jpayne@68
|
2409 /** TaxID of Virus node */
|
jpayne@68
|
2410 public static final int VIRUSES_ID=10239;
|
jpayne@68
|
2411 /** TaxID of Viroids node (now defunct) */
|
jpayne@68
|
2412 public static final int VIROIDS_ID=12884;
|
jpayne@68
|
2413
|
jpayne@68
|
2414 /** Maps normal level names to normal level numbers */
|
jpayne@68
|
2415 private static final HashMap<String, Integer> levelMap=makeLevelMap();
|
jpayne@68
|
2416 /** Maps extended level names to extended level numbers */
|
jpayne@68
|
2417 private static final HashMap<String, Integer> levelMapExtended=makeLevelMapExtended();
|
jpayne@68
|
2418 /** Maps extended level names to normal level numbers */
|
jpayne@68
|
2419 private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap();
|
jpayne@68
|
2420
|
jpayne@68
|
2421 /** Common extended level numbers as constants */
|
jpayne@68
|
2422 public static final int NO_RANK_E=NO_RANK,
|
jpayne@68
|
2423 SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"),
|
jpayne@68
|
2424 SUBSPECIES_E=stringToLevelExtended("subspecies"),
|
jpayne@68
|
2425 SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"),
|
jpayne@68
|
2426 FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"),
|
jpayne@68
|
2427 CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"),
|
jpayne@68
|
2428 KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"),
|
jpayne@68
|
2429 DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life");
|
jpayne@68
|
2430
|
jpayne@68
|
2431 /** Map of normal to extended level numbers */
|
jpayne@68
|
2432 private static final int[] levelToExtended=new int[] {
|
jpayne@68
|
2433 NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E,
|
jpayne@68
|
2434 ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E
|
jpayne@68
|
2435 };
|
jpayne@68
|
2436
|
jpayne@68
|
2437 /** Map of extended to normal level numbers */
|
jpayne@68
|
2438 private static final int[] extendedToLevel=makeExtendedToLevel();
|
jpayne@68
|
2439
|
jpayne@68
|
2440 /** Creates extendedToLevel from taxaNamesExtendedMatrix during initialization. */
|
jpayne@68
|
2441 private static int[] makeExtendedToLevel(){
|
jpayne@68
|
2442 int len=0;
|
jpayne@68
|
2443 for(String[] array : taxLevelNamesExtendedMatrix){
|
jpayne@68
|
2444 len+=array.length;
|
jpayne@68
|
2445 }
|
jpayne@68
|
2446 int[] ret=new int[len];
|
jpayne@68
|
2447
|
jpayne@68
|
2448 int pos=0;
|
jpayne@68
|
2449 for(int level=0; level<taxLevelNamesExtendedMatrix.length; level++){
|
jpayne@68
|
2450 String[] array=taxLevelNamesExtendedMatrix[level];
|
jpayne@68
|
2451 for(int i=0; i<array.length; i++){
|
jpayne@68
|
2452 ret[pos]=level;
|
jpayne@68
|
2453 pos++;
|
jpayne@68
|
2454 }
|
jpayne@68
|
2455 }
|
jpayne@68
|
2456 return ret;
|
jpayne@68
|
2457 }
|
jpayne@68
|
2458
|
jpayne@68
|
2459 /** Convert a standard level number (like KINGDOM) to extended (like KINGDOM_E). */
|
jpayne@68
|
2460 public static final int levelToExtended(int level){
|
jpayne@68
|
2461 return level<0 ? level : levelToExtended[level];
|
jpayne@68
|
2462 }
|
jpayne@68
|
2463
|
jpayne@68
|
2464 /** Convert an extended level number (like PHYLUM_E) to extended (like PHYLUM).
|
jpayne@68
|
2465 * Non-standard levels will be converted to the next higher standard level;
|
jpayne@68
|
2466 * e.g., subphylum -> phylum */
|
jpayne@68
|
2467 public static final int extendedToLevel(int extended){
|
jpayne@68
|
2468 return extended<0 ? -1 : extendedToLevel[extended];
|
jpayne@68
|
2469 }
|
jpayne@68
|
2470
|
jpayne@68
|
2471 /* Pre-compiled delimiters to save time when splitting lines */
|
jpayne@68
|
2472 private static final Pattern delimiterTab = Pattern.compile("\t");
|
jpayne@68
|
2473 private static final Pattern delimiter = Pattern.compile("\t\\|\t");
|
jpayne@68
|
2474 private static final Pattern delimiterPipe = Pattern.compile("\\|");
|
jpayne@68
|
2475 private static final Pattern delimiterTilde = Pattern.compile("\\~");
|
jpayne@68
|
2476 private static final Pattern delimiter2 = Pattern.compile("[\\s_]+");
|
jpayne@68
|
2477
|
jpayne@68
|
2478 public static boolean IMG_HQ=false;
|
jpayne@68
|
2479
|
jpayne@68
|
2480 /* For these fields, see the corresponding functions, below.
|
jpayne@68
|
2481 * They define the default paths to various data on NERSC. */
|
jpayne@68
|
2482
|
jpayne@68
|
2483 private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest";
|
jpayne@68
|
2484 private static final String defaultTaxPathAws="/test1/tax/latest";
|
jpayne@68
|
2485 private static final String defaultTaxPathIGBVM="/data/tax/latest";
|
jpayne@68
|
2486 private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
|
jpayne@68
|
2487 private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
|
jpayne@68
|
2488 private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz";
|
jpayne@68
|
2489 private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz";
|
jpayne@68
|
2490 private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz";
|
jpayne@68
|
2491 private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz";
|
jpayne@68
|
2492
|
jpayne@68
|
2493 private static final String defaultImgFile="TAX_PATH/imgDump.txt";
|
jpayne@68
|
2494 private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz";
|
jpayne@68
|
2495 private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz";
|
jpayne@68
|
2496 private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz";
|
jpayne@68
|
2497 private static final String defaultPatternFile="TAX_PATH/patterns.txt";
|
jpayne@68
|
2498 private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz";
|
jpayne@68
|
2499
|
jpayne@68
|
2500 private static final String defaultAccessionFile=
|
jpayne@68
|
2501 //"TAX_PATH/shrunk.protF.accession2taxid.gz," +
|
jpayne@68
|
2502 "TAX_PATH/shrunk.prot.accession2taxid.gz,"
|
jpayne@68
|
2503 + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz,"
|
jpayne@68
|
2504 + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz,"
|
jpayne@68
|
2505 + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz,"
|
jpayne@68
|
2506 // + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz,"
|
jpayne@68
|
2507 + "TAX_PATH/shrunk.dead_wgs.accession2taxid.gz,"
|
jpayne@68
|
2508 // + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz,"
|
jpayne@68
|
2509 + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz,"
|
jpayne@68
|
2510 + "TAX_PATH/shrunk.pdb.accession2taxid.gz";
|
jpayne@68
|
2511
|
jpayne@68
|
2512 /** For setting TAX_PATH, the root to taxonomy files */
|
jpayne@68
|
2513 public static final String defaultTaxPath(){
|
jpayne@68
|
2514 return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc;
|
jpayne@68
|
2515 }
|
jpayne@68
|
2516
|
jpayne@68
|
2517 /** 16S consensus sequences per TaxID */
|
jpayne@68
|
2518 public static final String default16SFile(){
|
jpayne@68
|
2519 return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc;
|
jpayne@68
|
2520 }
|
jpayne@68
|
2521
|
jpayne@68
|
2522 /** 18S consensus sequences per TaxID */
|
jpayne@68
|
2523 public static final String default18SFile(){
|
jpayne@68
|
2524 return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc;
|
jpayne@68
|
2525 }
|
jpayne@68
|
2526
|
jpayne@68
|
2527 /** Path to all taxonomy files, substituted in to make specific file paths */
|
jpayne@68
|
2528 public static String TAX_PATH=defaultTaxPath();
|
jpayne@68
|
2529
|
jpayne@68
|
2530 /** Location of gitable.int2d.gz for gi lookups */
|
jpayne@68
|
2531 public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);}
|
jpayne@68
|
2532 /** Location of tree.taxtree.gz */
|
jpayne@68
|
2533 public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);}
|
jpayne@68
|
2534
|
jpayne@68
|
2535 //Use the prot.FULL.gz ncbi file.
|
jpayne@68
|
2536 public static boolean protFull=false;
|
jpayne@68
|
2537
|
jpayne@68
|
2538 /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */
|
jpayne@68
|
2539 public static final String defaultAccessionFile(){
|
jpayne@68
|
2540 String s=(protFull ? "TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile;
|
jpayne@68
|
2541 return s.replaceAll("TAX_PATH", TAX_PATH);
|
jpayne@68
|
2542 }
|
jpayne@68
|
2543 /** Location of patterns.txt, which holds information about observed accession string formats */
|
jpayne@68
|
2544 public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);}
|
jpayne@68
|
2545 /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */
|
jpayne@68
|
2546 public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);}
|
jpayne@68
|
2547 /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */
|
jpayne@68
|
2548 public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);}
|
jpayne@68
|
2549
|
jpayne@68
|
2550 /** Screen output gets printed here */
|
jpayne@68
|
2551 private static PrintStream outstream=System.out;
|
jpayne@68
|
2552
|
jpayne@68
|
2553 }
|