Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/TaxTree.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package tax; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.io.Serializable; | |
5 import java.util.ArrayList; | |
6 import java.util.Arrays; | |
7 import java.util.HashMap; | |
8 import java.util.LinkedHashMap; | |
9 import java.util.List; | |
10 import java.util.regex.Pattern; | |
11 | |
12 import fileIO.ByteFile; | |
13 import fileIO.ReadWrite; | |
14 import fileIO.TextFile; | |
15 import shared.Parse; | |
16 import shared.Parser; | |
17 import shared.PreParser; | |
18 import shared.Shared; | |
19 import shared.Timer; | |
20 import shared.Tools; | |
21 import structures.ByteBuilder; | |
22 import structures.IntHashMap; | |
23 import structures.IntList; | |
24 import structures.IntLongHashMap; | |
25 | |
26 /** | |
27 * Represents a taxonomic tree. | |
28 * Usually just one of these needs to be created for a process. | |
29 * Designed for NCBI's taxdmp.zip file contents. | |
30 * @author Brian Bushnell | |
31 * @date Mar 6, 2015 | |
32 * | |
33 */ | |
34 public class TaxTree implements Serializable{ | |
35 | |
36 /** | |
37 * | |
38 */ | |
39 private static final long serialVersionUID = 5894416521711540017L; | |
40 | |
41 /*--------------------------------------------------------------*/ | |
42 /*---------------- Main ----------------*/ | |
43 /*--------------------------------------------------------------*/ | |
44 | |
45 /** | |
46 * Code entrance from the command line. | |
47 * This is not called normally, only when converting NCBI text files | |
48 * into a binary representation and writing it to disk. | |
49 * @param args Command line arguments | |
50 */ | |
51 public static void main(String[] args){ | |
52 | |
53 {//Preparse block for help, config files, and outstream | |
54 PreParser pp=new PreParser(args, outstream, null, false); | |
55 args=pp.args; | |
56 outstream=pp.outstream; | |
57 } | |
58 | |
59 assert(args.length>=4) : "TaxTree syntax:\ntaxtree.sh names.dmp nodes.dmp merged.dmp tree.taxtree.gz\n"; | |
60 ReadWrite.USE_UNPIGZ=true; | |
61 ReadWrite.USE_PIGZ=true; | |
62 ReadWrite.ZIPLEVEL=(Shared.threads()>2 ? 11 : 9); | |
63 ReadWrite.PIGZ_BLOCKSIZE=256; | |
64 ReadWrite.PIGZ_ITERATIONS=60; | |
65 | |
66 Timer t=new Timer(); | |
67 TaxTree tree=new TaxTree(args); | |
68 t.stop(); | |
69 | |
70 outstream.println("Retained "+tree.nodeCount+" nodes:"); | |
71 | |
72 for(int i=tree.treeLevelsExtended.length-1; i>=0; i--){ | |
73 outstream.print(tree.nodesPerLevelExtended[i]+"\t"+taxLevelNamesExtended[i]); | |
74 if(verbose){ | |
75 int lim=10; | |
76 for(int j=0; j<lim && j<tree.treeLevelsExtended[i].length; j++){ | |
77 TaxNode n=tree.treeLevelsExtended[i][j]; | |
78 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); | |
79 } | |
80 for(int j=tree.treeLevelsExtended[i].length-lim; j<tree.treeLevelsExtended[i].length; j++){ | |
81 if(j>=lim){ | |
82 TaxNode n=tree.treeLevelsExtended[i][j]; | |
83 outstream.print("\n"+n+" -> "+tree.nodes[n.pid]); | |
84 } | |
85 } | |
86 } | |
87 outstream.println(); | |
88 } | |
89 | |
90 | |
91 outstream.println(); | |
92 outstream.println("Time: \t"+t); | |
93 | |
94 if(args.length>2){//Write a tree | |
95 ReadWrite.write(tree, args[3], true); | |
96 } | |
97 } | |
98 | |
99 /** Parse arguments from the command line */ | |
100 private Parser parse(String[] args){ | |
101 | |
102 //Create a parser object | |
103 Parser parser=new Parser(); | |
104 | |
105 //Set any necessary Parser defaults here | |
106 //parser.foo=bar; | |
107 | |
108 //Parse each argument | |
109 for(int i=0; i<args.length; i++){ | |
110 String arg=args[i]; | |
111 | |
112 //Break arguments into their constituent parts, in the form of "a=b" | |
113 String[] split=arg.split("="); | |
114 String a=split[0].toLowerCase(); | |
115 String b=split.length>1 ? split[1] : null; | |
116 if(b!=null && b.equalsIgnoreCase("null")){b=null;} | |
117 | |
118 if(a.equals("verbose")){ | |
119 verbose=Parse.parseBoolean(b); | |
120 }else if(a.equals("parse_flag_goes_here")){ | |
121 long fake_variable=Parse.parseKMG(b); | |
122 //Set a variable here | |
123 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
124 //do nothing | |
125 }else if(i>3){ | |
126 outstream.println("Unknown parameter "+args[i]); | |
127 assert(false) : "Unknown parameter "+args[i]; | |
128 } | |
129 } | |
130 | |
131 return parser; | |
132 } | |
133 | |
134 /*--------------------------------------------------------------*/ | |
135 /*---------------- Initialization ----------------*/ | |
136 /*--------------------------------------------------------------*/ | |
137 | |
138 /*--------------------------------------------------------------*/ | |
139 /*---------------- Constructors ----------------*/ | |
140 /*--------------------------------------------------------------*/ | |
141 | |
142 /** | |
143 * Constructor using filenames from command line arguments, in the format of: | |
144 * {names, nodes, merged} | |
145 * @param args Command line arguments | |
146 */ | |
147 private TaxTree(String[] args){ | |
148 this(args[0], args[1], args[2], args); | |
149 } | |
150 | |
151 /** | |
152 * @param namesFile NCBI names.txt | |
153 * @param nodesFile NCBI nodes.txt | |
154 * @param mergedFile NCBI merged.txt | |
155 * @param args | |
156 */ | |
157 private TaxTree(String namesFile, String nodesFile, String mergedFile, String[] args){ | |
158 | |
159 if(args!=null) { | |
160 Parser parser=parse(args); | |
161 } | |
162 | |
163 nodes=getNames(namesFile); | |
164 getNodes(nodesFile, nodes); | |
165 | |
166 mergedMap=getMerged(mergedFile); | |
167 | |
168 countChildren(); | |
169 outstream.println("Counted children."); | |
170 int rounds=percolate(); | |
171 outstream.println("Percolated "+rounds+" rounds to fixpoint."); | |
172 | |
173 if(assignStrains){ | |
174 assignStrains(); | |
175 rounds=percolate(); | |
176 outstream.println("Percolated "+rounds+" rounds to fixpoint."); | |
177 } | |
178 | |
179 if(simplify){ | |
180 if(verbose){outstream.println("Simplifying.");} | |
181 int removed=simplify(nodes); | |
182 if(verbose){outstream.println("Removed "+removed+" nodes.");} | |
183 rounds=percolate(); | |
184 outstream.println("Percolated "+rounds+" rounds to fixpoint."); | |
185 } | |
186 int errors=test(nodes); | |
187 // assert(errors==0); //Not possible since the tree is wrong. | |
188 if(errors>0) { | |
189 System.err.println("Found "+errors+" errors in tree."); | |
190 } | |
191 | |
192 for(TaxNode n : nodes){ | |
193 if(n!=null){ | |
194 nodesPerLevel[n.level]++; | |
195 nodesPerLevelExtended[n.levelExtended]++; | |
196 } | |
197 } | |
198 | |
199 // for(int i=0; i<nodesPerLevel.length; i++){ | |
200 // treeLevels[i]=new TaxNode[nodesPerLevel[i]]; | |
201 // } | |
202 for(int i=0; i<nodesPerLevelExtended.length; i++){ | |
203 treeLevelsExtended[i]=new TaxNode[nodesPerLevelExtended[i]]; | |
204 } | |
205 | |
206 // { | |
207 // int[] temp=new int[nodesPerLevel.length]; | |
208 // for(TaxNode n : nodes){ | |
209 // if(n!=null){ | |
210 // int level=n.level; | |
211 // treeLevels[level][temp[level]]=n; | |
212 // temp[level]++; | |
213 // } | |
214 // } | |
215 // } | |
216 | |
217 { | |
218 int[] temp=new int[nodesPerLevelExtended.length]; | |
219 for(TaxNode n : nodes){ | |
220 if(n!=null){ | |
221 int level=n.levelExtended; | |
222 treeLevelsExtended[level][temp[level]]=n; | |
223 temp[level]++; | |
224 } | |
225 } | |
226 } | |
227 nodeCount=(int)Tools.sum(nodesPerLevelExtended); | |
228 | |
229 } | |
230 | |
231 /*--------------------------------------------------------------*/ | |
232 /*--------------- Loaders ----------------*/ | |
233 /*--------------------------------------------------------------*/ | |
234 | |
235 | |
236 /** | |
237 * Load a tax tree from disk. | |
238 * @param taxTreeFile Serialized TaxTree. | |
239 * @param hashNames Hash nodes using names as keys | |
240 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens | |
241 * @return | |
242 */ | |
243 public static final TaxTree loadTaxTree(String taxTreeFile, PrintStream outstream, boolean hashNames, | |
244 boolean hashDotFormat){ | |
245 if(taxTreeFile==null){return null;} | |
246 return loadTaxTree(taxTreeFile, null, null, null, outstream, hashNames, hashDotFormat); | |
247 } | |
248 | |
249 /** | |
250 * Load a tax tree from disk, either from a binary tree file, | |
251 * or from NCBI text files. | |
252 * @param taxTreeFile Binary representation; mutually exclusive with other files. | |
253 * @param taxNameFile NCBI names.txt | |
254 * @param taxNodeFile NCBI nodes.txt | |
255 * @param taxMergedFile NCBI merged.txt | |
256 * @param hashNames Hash nodes using names as keys | |
257 * @param hashDotFormat Hash nodes using abbreviations, e.g. H.sapiens | |
258 * @return The loaded tree | |
259 */ | |
260 public static final TaxTree loadTaxTree(String taxTreeFile, String taxNameFile, String taxNodeFile, | |
261 String taxMergedFile, PrintStream outstream, boolean hashNames, boolean hashDotFormat){ | |
262 if(taxTreeFile!=null || taxNodeFile==null){ | |
263 TaxTree tree=sharedTree(taxTreeFile, hashNames, hashDotFormat, outstream); | |
264 if(tree!=null){return tree;} | |
265 } | |
266 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=defaultTreeFile();} | |
267 assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files."; | |
268 Timer t=new Timer(); | |
269 if(outstream!=null){outstream.print("\nLoading tax tree; ");} | |
270 final TaxTree tree; | |
271 if(taxTreeFile!=null){ | |
272 tree=ReadWrite.read(TaxTree.class, taxTreeFile, true); | |
273 }else{ | |
274 tree=new TaxTree(taxNameFile, taxNodeFile, taxMergedFile, null); | |
275 } | |
276 t.stop(); | |
277 if(hashNames){ | |
278 if(outstream!=null){outstream.println("Hashing names.");} | |
279 tree.hashNames(hashDotFormat); | |
280 } | |
281 if(outstream!=null){ | |
282 outstream.println("Time: \t"+t); | |
283 Shared.printMemory(); | |
284 outstream.println(); | |
285 } | |
286 if(ALLOW_SHARED_TREE){sharedTree=tree;} | |
287 return tree; | |
288 } | |
289 | |
290 /*--------------------------------------------------------------*/ | |
291 /*--------------- Constructor Helpers ----------------*/ | |
292 /*--------------------------------------------------------------*/ | |
293 | |
294 /** | |
295 * Finds unranked nodes in the archaeal and bacterial kingdoms. | |
296 * If these are below species level, have a ranked parent, | |
297 * and have no ranked children, they are assigned strain or substrain. | |
298 */ | |
299 private void assignStrains(){ | |
300 | |
301 outstream.println("Assigning strains."); | |
302 int strains=0, substrains=0; | |
303 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed yet | |
304 TaxNode archaea=getNode(ARCHAEA_ID); | |
305 assert(bacteria.name.equalsIgnoreCase("Bacteria")); | |
306 assert(archaea.name.equalsIgnoreCase("Archaea")); | |
307 | |
308 ArrayList<TaxNode> bactList=new ArrayList<TaxNode>(); | |
309 ArrayList<TaxNode> archList=new ArrayList<TaxNode>(); | |
310 for(TaxNode tn : nodes){ | |
311 if(tn!=null && tn.originalLevel()==NO_RANK && tn.minParentLevelExtended<=SPECIES_E){ | |
312 if(descendsFrom(tn, bacteria)){ | |
313 bactList.add(tn); | |
314 }else if(descendsFrom(tn, archaea)){ | |
315 archList.add(tn); | |
316 } | |
317 } | |
318 } | |
319 | |
320 ArrayList<TaxNode> prokList=new ArrayList<TaxNode>(bactList.size()+archList.size()); | |
321 prokList.addAll(bactList); | |
322 prokList.addAll(archList); | |
323 | |
324 for(TaxNode tn : prokList){ | |
325 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ | |
326 TaxNode parent=nodes[tn.pid]; | |
327 if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ | |
328 tn.levelExtended=STRAIN_E; | |
329 tn.level=SUBSPECIES; | |
330 tn.setOriginalLevel(STRAIN_E); | |
331 strains++; | |
332 } | |
333 } | |
334 } | |
335 | |
336 // outstream.println("Assigned "+strains+" strains."); | |
337 for(TaxNode tn : prokList){ | |
338 if(tn.maxDescendantLevelIncludingSelf()==NO_RANK){ | |
339 TaxNode parent=nodes[tn.pid]; | |
340 if(parent.levelExtended==STRAIN_E){ | |
341 tn.levelExtended=SUBSTRAIN_E; | |
342 tn.level=SUBSPECIES; | |
343 tn.setOriginalLevel(SUBSTRAIN_E); | |
344 substrains++; | |
345 } | |
346 } | |
347 } | |
348 // outstream.println("Assigned "+substrains+" substrains."); | |
349 } | |
350 | |
351 @Deprecated | |
352 private void assignStrainsOld(){ | |
353 | |
354 outstream.println("Assigning strains."); | |
355 int strains=0, substrains=0; | |
356 TaxNode bacteria=getNode(BACTERIA_ID); //Can't do a name lookup since the names are not hashed | |
357 assert(bacteria.name.equalsIgnoreCase("Bacteria")); | |
358 for(TaxNode tn : nodes){ | |
359 if(tn!=null && tn.originalLevel()==NO_RANK){ | |
360 TaxNode parent=nodes[tn.pid]; | |
361 if(parent.levelExtended==SPECIES_E && commonAncestor(parent, bacteria)==bacteria){ | |
362 // nodesPerLevelExtended[STRAIN_E]++; | |
363 // nodesPerLevelExtended[tn.levelExtended]--; | |
364 tn.levelExtended=STRAIN_E; | |
365 tn.level=SUBSPECIES; | |
366 tn.setOriginalLevel(STRAIN_E); | |
367 strains++; | |
368 } | |
369 } | |
370 } | |
371 // outstream.println("Assigned "+strains+" strains."); | |
372 for(TaxNode tn : nodes){ | |
373 if(tn!=null && tn.originalLevel()==NO_RANK){ | |
374 TaxNode parent=nodes[tn.pid]; | |
375 if(parent.levelExtended==STRAIN_E && commonAncestor(parent, bacteria)==bacteria){ | |
376 // nodesPerLevelExtended[SUBSTRAIN_E]++; | |
377 // nodesPerLevelExtended[tn.levelExtended]--; | |
378 tn.levelExtended=SUBSTRAIN_E; | |
379 tn.level=SUBSPECIES; | |
380 tn.setOriginalLevel(SUBSTRAIN_E); | |
381 substrains++; | |
382 } | |
383 } | |
384 } | |
385 // outstream.println("Assigned "+substrains+" substrains."); | |
386 } | |
387 | |
388 /*--------------------------------------------------------------*/ | |
389 /*---------------- Construction ----------------*/ | |
390 /*--------------------------------------------------------------*/ | |
391 | |
392 /** | |
393 * Create tax nodes using names in the designated file. | |
394 * @param fname NCBI names.txt | |
395 * @return Array of created nodes, where array[x] contains the node with TaxID x. | |
396 */ | |
397 private static TaxNode[] getNames(String fname){ | |
398 ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000); | |
399 int max=0; | |
400 | |
401 TextFile tf=new TextFile(fname, false); | |
402 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ | |
403 if(s.contains("scientific name")){ | |
404 String[] split=delimiter.split(s, 3); | |
405 assert(split.length==3) : s; | |
406 int id=Integer.parseInt(split[0]); | |
407 String name=split[1]; | |
408 if(id==1 && name.equalsIgnoreCase("root")){name="Life";} | |
409 max=Tools.max(max, id); | |
410 list.add(new TaxNode(id, name)); | |
411 } | |
412 } | |
413 | |
414 TaxNode[] nodes=new TaxNode[max+1]; | |
415 for(TaxNode n : list){ | |
416 assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n; | |
417 nodes[n.id]=n; | |
418 } | |
419 | |
420 return nodes; | |
421 } | |
422 | |
423 /** | |
424 * Parses names file a second time to fill in additional information. | |
425 * Should really be merged into getNames. | |
426 * @TODO Merge into getNames | |
427 */ | |
428 private static TaxNode[] getNodes(String fname, TaxNode[] nodes){ | |
429 | |
430 int max=0; | |
431 | |
432 LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>(); | |
433 | |
434 TextFile tf=new TextFile(fname, false); | |
435 for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ | |
436 String[] split=delimiter.split(s, 4); | |
437 assert(split.length==4) : s; | |
438 int id=-1, pid=-1, level=-1, levelExtended=-1; | |
439 | |
440 id=Integer.parseInt(split[0]); | |
441 try { | |
442 pid=Integer.parseInt(split[1]); | |
443 } catch (NumberFormatException e) { | |
444 // TODO Auto-generated catch block | |
445 e.printStackTrace(); | |
446 System.err.println("Bad line: "+s+"\n"+Arrays.toString(split)); | |
447 } | |
448 boolean alt=false; | |
449 { | |
450 String key=split[2]; | |
451 Integer obj0=levelMap.get(key); | |
452 Integer obj=levelMapExtended.get(key); | |
453 assert(obj!=null) : "No level found for "+key+"; line="+Arrays.toString(split); | |
454 | |
455 if(obj0==null){ | |
456 obj0=altLevelMap.get(key); | |
457 alt=true; | |
458 } | |
459 if(obj0!=null){ | |
460 level=obj0; | |
461 levelExtended=obj; | |
462 if(id==pid){ | |
463 level=LIFE; | |
464 levelExtended=LIFE_E; | |
465 alt=false; | |
466 } | |
467 }else{ | |
468 if(id==pid){ | |
469 level=LIFE; | |
470 levelExtended=LIFE_E; | |
471 alt=false; | |
472 }else{ | |
473 int[] count=oddNames.get(key); | |
474 if(count==null){ | |
475 count=new int[1]; | |
476 oddNames.put(key, count); | |
477 } | |
478 count[0]++; | |
479 } | |
480 } | |
481 } | |
482 max=Tools.max(max, id); | |
483 TaxNode n=nodes[id]; | |
484 assert(n!=null && n.pid<0) : n+" -> "+s; | |
485 n.pid=pid; | |
486 n.level=level; | |
487 n.levelExtended=levelExtended; | |
488 n.setOriginalLevel(levelExtended); | |
489 n.setCanonical(!alt); | |
490 assert(n.canonical()==n.isSimple() || n.levelExtended==NO_RANK_E) : n.canonical()+", "+n.isSimple()+", "+n.level+", "+n.levelExtended+"\n"+n.toString()+"\n"; | |
491 } | |
492 | |
493 if(oddNames.size()>0){ | |
494 outstream.println("Found "+oddNames.size()+" unknown taxonomic levels:"); | |
495 if(verbose){ | |
496 for(String s : oddNames.keySet()){ | |
497 outstream.println(oddNames.get(s)[0]+"\t"+s); | |
498 } | |
499 } | |
500 } | |
501 | |
502 return nodes; | |
503 } | |
504 | |
505 /** | |
506 * Count child nodes of each node. | |
507 * This can be used to size arrays or determine which nodes are leaves. | |
508 */ | |
509 private void countChildren(){ | |
510 for(TaxNode child : nodes){ | |
511 if(child!=null && child.pid!=child.id){ | |
512 TaxNode parent=getNode(child.pid); | |
513 if(parent!=child){parent.numChildren++;} | |
514 } | |
515 } | |
516 } | |
517 | |
518 //TODO - This could be finished in 2 passes using the childTable. | |
519 /** | |
520 * Fill derived fields minParentLevelExtended and maxChildLevelExtended | |
521 * by percolating information through the tree until a fixpoint is reached. | |
522 * @TODO This could be finished in 2 passes using the childTable. | |
523 * @return Number of rounds required to reach fixpoint. | |
524 */ | |
525 private int percolate(){ | |
526 boolean changed=true; | |
527 int rounds=0; | |
528 while(changed){ | |
529 changed=false; | |
530 rounds++; | |
531 for(TaxNode child : nodes){ | |
532 if(child!=null && child.pid!=child.id){ | |
533 TaxNode parent=getNode(child.pid); | |
534 changed=(child.discussWithParent(parent) | changed); | |
535 } | |
536 } | |
537 | |
538 if(!changed){break;} | |
539 changed=false; | |
540 rounds++; | |
541 for(int i=nodes.length-1; i>=0; i--){ | |
542 TaxNode child=nodes[i]; | |
543 if(child!=null && child.pid!=child.id){ | |
544 TaxNode parent=getNode(child.pid); | |
545 changed=(child.discussWithParent(parent) | changed); | |
546 } | |
547 } | |
548 } | |
549 return rounds; | |
550 } | |
551 | |
552 /** | |
553 * Load nodes into the nameMap and nameMapLower, mapped to their names. | |
554 * @param genusDotSpecies Also hash abbreviations such as E.coli. | |
555 */ | |
556 public synchronized void hashNames(boolean genusDotSpecies){ | |
557 if(nameMap!=null){return;} | |
558 assert(nameMap==null); | |
559 assert(nameMapLower==null); | |
560 final int size=((int)Tools.mid(2, (nodes.length+(genusDotSpecies ? nodesPerLevelExtended[SPECIES_E] : 0))*1.5, Shared.MAX_ARRAY_LEN)); | |
561 nameMap=new HashMap<String, ArrayList<TaxNode>>(size); | |
562 nameMapLower=new HashMap<String, ArrayList<TaxNode>>(size); | |
563 | |
564 //Hash the names, both lowercase and uppercase | |
565 for(TaxNode n : nodes){ | |
566 if(n!=null){ | |
567 String name=n.name; | |
568 if(name.indexOf('_')>=0){ | |
569 name=name.replace('_', ' ').trim(); | |
570 } | |
571 if(name!=null && !name.equals("environmental samples")){ | |
572 { | |
573 ArrayList<TaxNode> list=nameMap.get(name); | |
574 if(list==null){ | |
575 list=new ArrayList<TaxNode>(); | |
576 nameMap.put(name, list); | |
577 } | |
578 list.add(n); | |
579 } | |
580 { | |
581 String lc=name.toLowerCase(); | |
582 ArrayList<TaxNode> list=nameMapLower.get(lc); | |
583 if(list==null){ | |
584 list=new ArrayList<TaxNode>(); | |
585 nameMapLower.put(lc, list); | |
586 } | |
587 list.add(n); | |
588 } | |
589 } | |
590 } | |
591 } | |
592 | |
593 //Hash G.species versions of the names, both lowercase and uppercase | |
594 if(genusDotSpecies){ | |
595 ByteBuilder bb=new ByteBuilder(64); | |
596 for(TaxNode n : nodes){ | |
597 if(n!=null && n.levelExtended==SPECIES_E){ | |
598 String name=n.name; | |
599 if(name.indexOf('_')>=0){ | |
600 name=name.replace('_', ' ').trim(); | |
601 } | |
602 if(name!=null && !name.equals("environmental samples")){ | |
603 final String dotFormat=dotFormat(name, bb); | |
604 if(dotFormat!=null){ | |
605 { | |
606 ArrayList<TaxNode> list=nameMap.get(dotFormat); | |
607 if(list==null){ | |
608 list=new ArrayList<TaxNode>(); | |
609 nameMap.put(dotFormat, list); | |
610 } | |
611 list.add(n); | |
612 } | |
613 { | |
614 String lc=dotFormat.toLowerCase(); | |
615 ArrayList<TaxNode> list=nameMapLower.get(lc); | |
616 if(list==null){ | |
617 list=new ArrayList<TaxNode>(); | |
618 nameMapLower.put(lc, list); | |
619 } | |
620 list.add(n); | |
621 } | |
622 } | |
623 } | |
624 } | |
625 } | |
626 } | |
627 } | |
628 | |
629 /** | |
630 * Generate the "dot format" name of a node. | |
631 * For example, transform "Homo sapiens" to "H.sapiens" | |
632 * @param name Node name | |
633 * @param buffer A ByteBuilder that may be modified | |
634 * @return Dot format | |
635 */ | |
636 private static String dotFormat(String name, ByteBuilder buffer){ | |
637 if(name==null || name.indexOf('.')>=0){return null;} | |
638 final int firstSpace=name.indexOf(' '); | |
639 if(firstSpace<0 || firstSpace>=name.length()-1){return null;} | |
640 final int lastSpace=name.lastIndexOf(' '); | |
641 if(firstSpace!=lastSpace){return null;} | |
642 final String a=name.substring(0, firstSpace); | |
643 final String b=name.substring(lastSpace+1); | |
644 final char ca=a.charAt(0); | |
645 final char cb=b.charAt(0); | |
646 if(!Tools.isUpperCase(ca) || !Tools.isLowerCase(cb)){return null;} | |
647 if(buffer==null){buffer=new ByteBuilder(2+b.length());} | |
648 else{buffer.clear();} | |
649 buffer.append(ca).append('.').append(b); | |
650 return buffer.toString(); | |
651 } | |
652 | |
653 /** | |
654 * Fill childMap, which maps nodes to their children. | |
655 */ | |
656 public synchronized void hashChildren(){ | |
657 assert(childMap==null); | |
658 int nodesWithChildren=0; | |
659 for(TaxNode tn : nodes){ | |
660 if(tn!=null && tn.numChildren>0){nodesWithChildren++;} | |
661 } | |
662 childMap=new HashMap<TaxNode, ArrayList<TaxNode>>((int)Tools.mid(2, nodesWithChildren*1.5, Shared.MAX_ARRAY_LEN)); | |
663 for(TaxNode tn : nodes){ | |
664 if(tn!=null){ | |
665 if(tn.numChildren>0){ | |
666 childMap.put(tn, new ArrayList<TaxNode>(tn.numChildren)); | |
667 } | |
668 } | |
669 } | |
670 for(TaxNode tn : nodes){ | |
671 if(tn!=null){ | |
672 if(tn.id!=tn.pid){ | |
673 ArrayList<TaxNode> list=childMap.get(getNode(tn.pid)); | |
674 if(list!=null){list.add(tn);} | |
675 } | |
676 } | |
677 } | |
678 } | |
679 | |
680 /** | |
681 * Fetch this node's children. | |
682 * @param parent Node in question | |
683 * @return List of child nodes | |
684 */ | |
685 public ArrayList<TaxNode> getChildren(TaxNode parent){ | |
686 if(parent.numChildren<1){return null;} | |
687 if(childMap!=null){return childMap.get(parent);} | |
688 ArrayList<TaxNode> list=new ArrayList<TaxNode>(parent.numChildren); | |
689 for(TaxNode tn : nodes){ | |
690 if(tn!=null && tn.id!=tn.pid && tn.pid==parent.id){ | |
691 list.add(tn); | |
692 } | |
693 } | |
694 return list; | |
695 } | |
696 | |
697 /** | |
698 * Load a map of old to new TaxIDs. | |
699 * @param mergedFile NCBI merged.txt. | |
700 * @return Map of old to new TaxIDs | |
701 */ | |
702 private static IntHashMap getMerged(String mergedFile) { | |
703 if(mergedFile==null){return null;} | |
704 String[] lines=TextFile.toStringLines(mergedFile); | |
705 if(lines.length<1){return null;} | |
706 IntHashMap map=new IntHashMap((int)(lines.length*1.3)); | |
707 for(String line : lines){ | |
708 String[] split=delimiterTab.split(line); | |
709 int a=Integer.parseInt(split[0]); | |
710 int b=Integer.parseInt(split[2]); | |
711 map.put(a, b); | |
712 } | |
713 return map; | |
714 } | |
715 | |
716 /** | |
717 * Simplify the tree by assigning ranks to unranked nodes, | |
718 * where possible, through inference. | |
719 * Optionally removes unranked nodes based on the skipNorank field. | |
720 * @param nodes Array of all TaxNodes. | |
721 * @return Number of nodes removed. | |
722 */ | |
723 private int simplify(TaxNode nodes[]){ | |
724 | |
725 int failed=test(nodes); | |
726 | |
727 int removed=0; | |
728 int reassigned=0; | |
729 | |
730 if(reassign){ | |
731 boolean changed=true; | |
732 int changedCount=0; | |
733 while(changed){ | |
734 changed=false; | |
735 for(int i=0; i<nodes.length; i++){ | |
736 TaxNode n=nodes[i]; | |
737 if(n!=null && n.levelExtended<1){ | |
738 int pid=n.pid; | |
739 TaxNode parent=nodes[pid]; | |
740 assert(parent!=null) : n; | |
741 if(n.maxDescendantLevelIncludingSelf()<SUBSPECIES_E && | |
742 parent.minAncestorLevelIncludingSelf()<=SPECIES_E && parent.minAncestorLevelIncludingSelf()>=SUBSPECIES_E){ | |
743 // if(parent.levelExtended==SPECIES_E || parent.levelExtended==SUBSPECIES_E){ | |
744 changed=true; | |
745 n.levelExtended=SUBSPECIES_E; | |
746 n.level=SUBSPECIES; | |
747 changedCount++; | |
748 } | |
749 } | |
750 } | |
751 } | |
752 System.err.println("Assigned levels to "+changedCount+" unranked nodes."); | |
753 } | |
754 | |
755 | |
756 if(skipNorank){//Skip nodes with unknown taxa | |
757 if(verbose){outstream.println("A0");} | |
758 | |
759 for(int i=0; i<nodes.length; i++){ | |
760 TaxNode n=nodes[i]; | |
761 if(n!=null){ | |
762 int pid=n.pid; | |
763 TaxNode parent=nodes[pid]; | |
764 assert(parent!=null) : n; | |
765 assert(parent!=n || pid==1) : n+", "+pid; | |
766 while(parent.levelExtended<1 && n.levelExtended>parent.levelExtended){ | |
767 //System.err.println("Reassigned from "+parent); | |
768 assert(parent.id!=parent.pid); | |
769 parent=nodes[parent.pid]; | |
770 n.pid=parent.id; | |
771 reassigned++; | |
772 } | |
773 } | |
774 } | |
775 | |
776 for(int i=0; i<nodes.length; i++){ | |
777 if(nodes[i]!=null && nodes[i].levelExtended<0){ | |
778 System.err.println("Removed "+nodes[i]); | |
779 nodes[i]=null; | |
780 removed++; | |
781 } | |
782 } | |
783 if(verbose){outstream.println("Skipped "+reassigned+" unranked parents, removed "+removed+" invalid nodes.");} | |
784 } | |
785 | |
786 if(inferRankLimit>0){//Infer level for unset nodes (from "no rank") | |
787 if(verbose){outstream.println("A");} | |
788 int changed=1; | |
789 while(changed>0){ | |
790 changed=0; | |
791 for(final TaxNode n : nodes){ | |
792 if(n!=null){ | |
793 if(n.levelExtended==0){ | |
794 TaxNode parent=nodes[n.pid]; | |
795 if(n!=parent && parent.levelExtended>0 && parent.levelExtended<=inferRankLimit+1){ | |
796 n.levelExtended=Tools.max(1, parent.levelExtended-1); | |
797 assert(n.levelExtended>0 && n.levelExtended<=parent.levelExtended && n.levelExtended<=inferRankLimit); | |
798 changed++; | |
799 } | |
800 } | |
801 } | |
802 } | |
803 if(verbose){outstream.println("changed: "+changed);} | |
804 } | |
805 | |
806 // outstream.println("B"); | |
807 // for(TaxNode n : nodes){ | |
808 // if(n!=null && n.level==0){ | |
809 // n.level=-1; | |
810 // } | |
811 // } | |
812 } | |
813 | |
814 failed=test(nodes); | |
815 | |
816 // if(reassign){//Skip nodes with duplicate taxa | |
817 // if(verbose){outstream.println("D");} | |
818 // int changed=1; | |
819 // while(changed>0){ | |
820 // changed=0; | |
821 // for(final TaxNode n : nodes){ | |
822 // if(n!=null){ | |
823 // TaxNode parent=nodes[n.pid]; | |
824 // TaxNode grandparent=nodes[parent.pid]; | |
825 // assert(n.level<=parent.level || parent.level<1 || !parent.canonical()) : n+" -> "+parent+" -> "+grandparent; | |
826 // assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical()) : n+" -> "+parent+" -> "+grandparent; | |
827 // | |
828 // while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical()) || | |
829 // n.level>parent.level || (n.level==parent.level))){ | |
830 // parent=grandparent; | |
831 // grandparent=nodes[parent.pid]; | |
832 // n.pid=parent.id; | |
833 // reassigned++; | |
834 // changed++; | |
835 // } | |
836 // } | |
837 // } | |
838 // if(verbose){outstream.println("changed: "+changed);} | |
839 // } | |
840 // if(verbose){outstream.println("E");} | |
841 // for(int i=0; i<nodes.length; i++){ | |
842 // if(nodes[i]!=null && nodes[i].level<0){ | |
843 // nodes[i]=null; | |
844 // removed++; | |
845 // } | |
846 // } | |
847 // } | |
848 | |
849 failed=test(nodes); | |
850 | |
851 if(verbose){outstream.println("F");} | |
852 {//Ensure the tree is now clean | |
853 for(int i=0; i<nodes.length; i++){ | |
854 TaxNode n=nodes[i]; | |
855 if(n!=null){ | |
856 TaxNode parent=nodes[n.pid]; | |
857 TaxNode grandparent=nodes[parent.pid]; | |
858 assert(n==parent || n.levelExtended<=parent.levelExtended || !n.canonical() || n.levelExtended<1 || parent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; | |
859 assert(parent==grandparent || parent.levelExtended<=grandparent.levelExtended || !parent.canonical() || parent.levelExtended<1 || grandparent.levelExtended<1) : n+" -> "+parent+" -> "+grandparent; | |
860 } | |
861 } | |
862 } | |
863 | |
864 // if(verbose){System.err.println("Reassignments: "+reassigned);} | |
865 | |
866 return removed; | |
867 } | |
868 | |
869 /*--------------------------------------------------------------*/ | |
870 /*---------------- Validation ----------------*/ | |
871 /*--------------------------------------------------------------*/ | |
872 | |
873 /** | |
874 * Ensure tree has monotonically increasing (or nondescending) ranks. | |
875 * @param nodes All TaxNodes. | |
876 * @return Number of violations. | |
877 */ | |
878 private static int test(TaxNode[] nodes){ | |
879 int failed=0; | |
880 for(final TaxNode n : nodes){ | |
881 if(n!=null){ | |
882 TaxNode parent=nodes[n.pid]; | |
883 try { | |
884 assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical()) : | |
885 "\n"+n+" -> "+parent+", level="+n.level+", plevel="+parent.level+", pcanon="+parent.canonical()+"\n" | |
886 + "levelE="+n.levelExtended+", plevelE="+parent.levelExtended; | |
887 assert(n==parent || n.levelExtended<=parent.levelExtended || parent.levelExtended<1) : n+" -> "+parent; | |
888 // assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical() || !parent.canonical()) : n+" -> "+parent; | |
889 if(n!=parent && n.level>parent.level && parent.level>=1 && n.canonical() && parent.canonical()){ | |
890 if(verbose){outstream.println("Error: "+n+" -> "+parent);} | |
891 failed++; | |
892 }else if(n!=parent && parent.levelExtended>=1 && n.levelExtended>=parent.levelExtended){ | |
893 // if(verbose){outstream.println("Error: "+n+" -> "+parent);} | |
894 // failed++; | |
895 } | |
896 assert(n!=parent || n.id<=1) : n; | |
897 } catch (Throwable e) { | |
898 // TODO Auto-generated catch block | |
899 e.printStackTrace(); | |
900 failed++; | |
901 } | |
902 } | |
903 } | |
904 if(verbose || failed>0){outstream.println(failed+" nodes failed.");} | |
905 return failed; | |
906 } | |
907 | |
908 /*--------------------------------------------------------------*/ | |
909 /*---------------- Print Methods ----------------*/ | |
910 /*--------------------------------------------------------------*/ | |
911 | |
912 /** | |
913 * Format full name in semicolon format, e.g. | |
914 * "SK:Bacteria;P:Protobacteria;..." | |
915 * @param tn0 Base node | |
916 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. | |
917 * @return Resultant String | |
918 */ | |
919 public String toSemicolon(final TaxNode tn0, boolean skipNonCanonical, boolean mononomial){ | |
920 StringBuilder sb=new StringBuilder(); | |
921 if(tn0==null){return "Not found";} | |
922 String semi=""; | |
923 ArrayList<TaxNode> list=toAncestors(tn0, skipNonCanonical); | |
924 boolean addTaxLevel=true; | |
925 for(int i=list.size()-1; i>=0; i--){ | |
926 sb.append(semi); | |
927 TaxNode tn=list.get(i); | |
928 if(tn.id!=LIFE_ID || list.size()==1){ | |
929 if(addTaxLevel && tn.canonical() && !tn.levelChanged() && tn.isSimple()){ | |
930 sb.append(tn.levelToStringShort()).append(':'); | |
931 } | |
932 sb.append(mononomial ? mononomial(tn) : tn.name); | |
933 semi=";"; | |
934 } | |
935 } | |
936 return sb.toString(); | |
937 } | |
938 | |
939 /** | |
940 * Return a list of TaxIDs of all ancestors. | |
941 * @param tn0 Base node | |
942 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. | |
943 * @return List of TaxIDs. | |
944 */ | |
945 public IntList toAncestorIds(final TaxNode tn0, boolean skipNonCanonical){ | |
946 if(tn0==null){return null;} | |
947 IntList list=new IntList(8); | |
948 | |
949 TaxNode tn=tn0; | |
950 while(tn!=null){ | |
951 if(!skipNonCanonical || tn.isSimple()){ | |
952 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn.id);} | |
953 } | |
954 if(tn.pid==tn.id){break;} | |
955 tn=getNode(tn.pid); | |
956 } | |
957 if(list.isEmpty()){list.add(tn0.id);} | |
958 return list; | |
959 } | |
960 | |
961 /** | |
962 * Return a list of all ancestors. | |
963 * @param tn0 Base node | |
964 * @param skipNonCanonical Ignore noncanonical (aka "nonsimple") levels like Tribe. | |
965 * @return List of ancestor nodes. | |
966 */ | |
967 public ArrayList<TaxNode> toAncestors(final TaxNode tn0, boolean skipNonCanonical){ | |
968 if(tn0==null){return null;} | |
969 ArrayList<TaxNode> list=new ArrayList<TaxNode>(8); | |
970 | |
971 TaxNode tn=tn0; | |
972 while(tn!=null){ | |
973 if(!skipNonCanonical || tn.isSimple()){ | |
974 if(tn.id!=CELLULAR_ORGANISMS_ID || tn==tn0){list.add(tn);} | |
975 } | |
976 if(tn.pid==tn.id){break;} | |
977 tn=getNode(tn.pid); | |
978 } | |
979 if(list.isEmpty()){list.add(tn0);} | |
980 return list; | |
981 } | |
982 | |
983 /** | |
984 * Generate a path to the genome of an organism on the filesystem; | |
985 * used by ExplodeTree. Intended for internal JGI use. | |
986 * @param root Location of the exploded tree. | |
987 * @return Path to a genome. | |
988 */ | |
989 public String toDir(TaxNode node, String root){ | |
990 StringBuilder sb=new StringBuilder(); | |
991 if(root==null){root="";} | |
992 sb.append(root); | |
993 if(root.length()>0 && !root.endsWith("/")){sb.append('/');} | |
994 IntList list=toAncestorIds(node, false); | |
995 list.reverse(); | |
996 assert(list.get(0)==1) : list + "," +getNode(list.get(0)); | |
997 for(int i=0; i<list.size(); i++){ | |
998 sb.append(list.get(i)); | |
999 sb.append('/'); | |
1000 } | |
1001 return sb.toString(); | |
1002 } | |
1003 | |
1004 /*--------------------------------------------------------------*/ | |
1005 /*---------------- Outer Methods ----------------*/ | |
1006 /*--------------------------------------------------------------*/ | |
1007 | |
1008 /** | |
1009 * Use various techniques to get a TaxID from an unknown String, such as parsing, | |
1010 * name lookups, and accession translation. | |
1011 * @param s String to process. | |
1012 * @return Decoded TaxID. | |
1013 */ | |
1014 public static int getID(String s){return GiToTaxid.getID(s);} | |
1015 | |
1016 /** | |
1017 * Use various techniques to get a TaxID from an unknown byte[], such as parsing, | |
1018 * name lookups, and accession translation. | |
1019 * @param s String to process. | |
1020 * @return Decoded TaxID. | |
1021 */ | |
1022 public static int getID(byte[] s){return GiToTaxid.getID(s);} | |
1023 | |
1024 /** Return the lowest ancestor of the named node with taxonomic level at least minLevel */ | |
1025 public TaxNode getNode(String s, int minLevelExtended){ | |
1026 TaxNode tn=parseNodeFromHeader(s, true); | |
1027 while(tn!=null && tn.levelExtended<minLevelExtended && tn.pid!=tn.id){ | |
1028 tn=getNode(tn.pid); | |
1029 } | |
1030 return tn; | |
1031 } | |
1032 | |
1033 /** | |
1034 * Determine whether a node is a descendant of another. | |
1035 * @param child Possible child TaxID. | |
1036 * @param parent Possible parent TaxID. | |
1037 * @return true iff child descends from parent. | |
1038 */ | |
1039 public boolean descendsFrom(final int child, final int parent){ | |
1040 TaxNode cn=getNode(child), pn=getNode(parent); | |
1041 assert(cn!=null) : "Invalid taxID: "+child; | |
1042 assert(pn!=null) : "Invalid taxID: "+parent; | |
1043 return descendsFrom(cn, pn); | |
1044 } | |
1045 | |
1046 /** | |
1047 * Determine whether a node is a descendant of another. | |
1048 * @param child Possible child node. | |
1049 * @param parent Possible parent node. | |
1050 * @return true iff child descends from parent. | |
1051 */ | |
1052 public boolean descendsFrom(TaxNode child, TaxNode parent){ | |
1053 assert(child!=null && parent!=null) : "Null parameters."; | |
1054 if(child==null || parent==null){return false;} | |
1055 | |
1056 while(child!=parent && child.levelExtended<=parent.levelExtended && child.id!=child.pid){ | |
1057 child=getNode(child.pid); | |
1058 } | |
1059 return child==parent; | |
1060 } | |
1061 | |
1062 /** | |
1063 * Determine whether an organism is classified as X. | |
1064 * @param taxID taxID of organism. | |
1065 * @param ancestorID taxID of possible ancestor. | |
1066 * @return true if the organism is an X. | |
1067 */ | |
1068 public boolean descendsFrom2(int taxID, final int ancestorID) { | |
1069 TaxNode tn=getNode(taxID); | |
1070 while(tn.id!=tn.pid){ | |
1071 if(tn.id==ancestorID){return true;} | |
1072 tn=getNode(tn.pid); | |
1073 } | |
1074 return false; | |
1075 } | |
1076 | |
1077 /** Determine whether an organism is classified as a plant. */ | |
1078 public boolean isPlant(int taxID) {return descendsFrom2(taxID, VIRIDIPLANTAE_ID);} | |
1079 | |
1080 /** Determine whether an organism is classified as an animal. */ | |
1081 public boolean isAnimal(int taxID) {return descendsFrom2(taxID, METAZOA_ID);} | |
1082 | |
1083 /** Determine whether an organism is classified as a fungus. */ | |
1084 public boolean isFungus(int taxID) {return descendsFrom2(taxID, FUNGI_ID);} | |
1085 | |
1086 /** Determine whether an organism is classified as a eukaryote. */ | |
1087 public boolean isEukaryote(int taxID) {return descendsFrom2(taxID, EUKARYOTA_ID);} | |
1088 | |
1089 /** Determine whether an organism is classified as a prokaryote. */ | |
1090 public boolean isProkaryote(int taxID) { | |
1091 TaxNode tn=getNode(taxID); | |
1092 if(tn==null){ | |
1093 System.err.println("*** Warning: Can't find node "+taxID+" ***"); | |
1094 return false; | |
1095 } | |
1096 while(tn.id!=tn.pid){ | |
1097 if(tn.id==BACTERIA_ID || tn.id==ARCHAEA_ID){return true;} | |
1098 tn=getNode(tn.pid); | |
1099 } | |
1100 return false; | |
1101 } | |
1102 | |
1103 /** | |
1104 * Calculate the common ancestor of two nodes. | |
1105 * @param a TaxID of a node. | |
1106 * @param b TaxID of a node. | |
1107 * @return Common ancestor ID of a and b. | |
1108 */ | |
1109 public int commonAncestor(final int a, final int b){ | |
1110 TaxNode an=getNode(a), bn=getNode(b); | |
1111 assert(an!=null) : "Invalid taxID: "+a; | |
1112 assert(bn!=null) : "Invalid taxID: "+b; | |
1113 TaxNode cn=commonAncestor(an, bn); | |
1114 assert(cn!=null) : "No common ancestor: "+an+", "+bn; | |
1115 if(cn==null){return -1;} | |
1116 return cn.id; | |
1117 } | |
1118 | |
1119 /** | |
1120 * Calculate the common ancestor of two nodes. | |
1121 * @param a A node. | |
1122 * @param b A node. | |
1123 * @return Common ancestor of a and b. | |
1124 */ | |
1125 public TaxNode commonAncestor(TaxNode a, TaxNode b){ | |
1126 assert(a!=null && b!=null) : "Null parameters."; | |
1127 if(a==null){return b;} | |
1128 if(b==null){return a;} | |
1129 | |
1130 while(a!=b){ | |
1131 if(a.levelExtended<b.levelExtended){ | |
1132 a=getNode(a.pid); | |
1133 }else{ | |
1134 b=getNode(b.pid); | |
1135 } | |
1136 } | |
1137 return a; | |
1138 } | |
1139 | |
1140 /** | |
1141 * Identify the highest ancestor of a node; | |
1142 * this will presumably be "Life". | |
1143 * @param a Node | |
1144 * @return Highest ancestor | |
1145 */ | |
1146 public TaxNode highestAncestor(TaxNode a){ | |
1147 assert(a!=null); | |
1148 while(a.id!=a.pid){a=getNode(a.pid);} | |
1149 return a; | |
1150 } | |
1151 | |
1152 /*--------------------------------------------------------------*/ | |
1153 /*---------------- Header Parsing ----------------*/ | |
1154 /*--------------------------------------------------------------*/ | |
1155 | |
1156 /** | |
1157 * Determine the TaxID of a String, | |
1158 * without a loaded TaxTree. | |
1159 * This only works if the literal TaxID is embedded in the String. | |
1160 * @param header Typically a sequence header | |
1161 * @return Decoded TaxID, or -1 if unsuccessful | |
1162 */ | |
1163 public static int parseHeaderStatic(String header){ | |
1164 if(header.length()<3){return -1;} | |
1165 if(header.charAt(0)=='>'){header=header.substring(1);} | |
1166 if(!header.startsWith("tid|")){return -1;} | |
1167 int idx=3; | |
1168 int idx2=header.indexOf('|', 4); | |
1169 if(idx2<5){return -1;} | |
1170 int id=-1; | |
1171 try { | |
1172 id=Parse.parseInt(header, idx+1, idx2); | |
1173 // System.err.println("d"+", "+header.substring(idx+1, idx2)); | |
1174 } catch (Throwable e) { | |
1175 // System.err.println("e"+", "+header.substring(idx+1, idx2)); | |
1176 //ignore | |
1177 } | |
1178 return id; | |
1179 } | |
1180 | |
1181 /** | |
1182 * Determine the TaxID of a String. | |
1183 * @param header Typically a sequence header | |
1184 * @param bestEffort In some cases, try certain substrings if the name is not found. | |
1185 * @return | |
1186 */ | |
1187 public TaxNode parseNodeFromHeader(String header, boolean bestEffort){ | |
1188 if(header==null || header.length()<2){return null;} | |
1189 if(header.charAt(0)=='>'){header=header.substring(1);} | |
1190 TaxNode tn; | |
1191 if(SILVA_MODE){ | |
1192 tn=getNodeSilva(header, bestEffort); | |
1193 }else if(UNITE_MODE){ | |
1194 tn=getNodeUnite(header, bestEffort); | |
1195 }else{ | |
1196 final char delimiter=ncbiHeaderDelimiter(header); | |
1197 if(delimiter==' '){ | |
1198 tn=getNodeNewStyle(header); | |
1199 }else{ | |
1200 tn=getNodeOldStyle(header, delimiter); | |
1201 if(tn==null && delimiter=='|'){ | |
1202 // System.err.println("A: "+header); | |
1203 int id=-1; | |
1204 String[] split=header.split("\\|"); | |
1205 if(AccessionToTaxid.LOADED()){ | |
1206 for(int i=0; i<split.length && id<0; i++){//Try accessions first | |
1207 if(AccessionToTaxid.isValidAccession(split[i])){ | |
1208 id=AccessionToTaxid.get(split[i]); | |
1209 } | |
1210 } | |
1211 } | |
1212 for(int i=0; i<split.length && id<0; i++){//Then names | |
1213 id=parseNameToTaxid(split[i]); | |
1214 } | |
1215 // System.err.println("E: "+id); | |
1216 if(id>=0){tn=getNode(id);} | |
1217 // System.err.println("F: "+tn); | |
1218 } | |
1219 } | |
1220 } | |
1221 return tn; | |
1222 } | |
1223 | |
1224 /** | |
1225 * Guess the delimiter character in a String; | |
1226 * typically assumed to be '|', '~', or ' '. | |
1227 */ | |
1228 public static char ncbiHeaderDelimiter(String header){ | |
1229 for(int i=0; i<header.length(); i++){ | |
1230 final char c=header.charAt(i); | |
1231 if(c=='|' || c=='~'){ | |
1232 assert(i>0) : "i="+i+"; malformatted header '"+header+"'"; | |
1233 return c; | |
1234 }else if(Character.isWhitespace(c)){ | |
1235 return ' '; | |
1236 } | |
1237 } | |
1238 return ' '; | |
1239 } | |
1240 | |
1241 /** | |
1242 * Parse a Silva header to a Node. | |
1243 * @param s Silva header. | |
1244 * @param bestEffort Try certain substrings if the name is not found. | |
1245 * @return Node | |
1246 */ | |
1247 TaxNode getNodeSilva(String s, boolean bestEffort){ | |
1248 if(s==null){return null;} | |
1249 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ | |
1250 return getNodeOldStyle(s, s.charAt(3)); | |
1251 } | |
1252 String[] split=Tools.semiPattern.split(s); | |
1253 | |
1254 int number=-1; | |
1255 // final boolean chloroplast=(split.length>1 && split[split.length-1].equals("Chloroplast")); | |
1256 // if(chloroplast){return null;} | |
1257 for(int i=split.length-1; number<0 && i>=0; i--){ | |
1258 String last=split[i]; | |
1259 int paren=last.indexOf('('); | |
1260 if(paren>=0){last=last.substring(0, paren);} | |
1261 last=last.trim(); | |
1262 | |
1263 if(!last.startsWith("uncultured") && !last.startsWith("unidentified")){ | |
1264 number=parseNameToTaxid(last); | |
1265 } | |
1266 | |
1267 if(number>=0){return getNode(number);} | |
1268 else if(!bestEffort){break;} | |
1269 } | |
1270 return null; | |
1271 } | |
1272 | |
1273 /** | |
1274 * Parse a Unite header to a Node. | |
1275 * @param s Unite header. | |
1276 * @param bestEffort Try certain substrings if the name is not found. | |
1277 * @return Node | |
1278 */ | |
1279 TaxNode getNodeUnite(String s, boolean bestEffort){ | |
1280 if(s==null){return null;} | |
1281 if(s.length()>=5 && s.startsWith("tid") && (s.charAt(3)=='|' || s.charAt(3)=='~') && Tools.isDigit(s.charAt(4))){ | |
1282 return getNodeOldStyle(s, s.charAt(3)); | |
1283 } | |
1284 String[] split=Tools.pipePattern.split(s); | |
1285 | |
1286 int number=-1; | |
1287 String name=split[0]; | |
1288 String acc=split[1]; | |
1289 if(AccessionToTaxid.LOADED() && acc.length()>0){ | |
1290 number=AccessionToTaxid.get(acc); | |
1291 } | |
1292 if(number<1){ | |
1293 TaxNode tn=getNodeByName(name); | |
1294 if(tn!=null){number=tn.id;} | |
1295 } | |
1296 | |
1297 if(number>=0){return getNode(number);} | |
1298 return null; | |
1299 } | |
1300 | |
1301 /** Parses sequence headers using NCBI's old-style header system, prior to Accessions. */ | |
1302 private TaxNode getNodeOldStyle(final String s, char delimiter){ | |
1303 { | |
1304 int index=s.indexOf(delimiter); | |
1305 if(index<0){ | |
1306 delimiter='~'; | |
1307 index=s.indexOf(delimiter); | |
1308 if(index<0){ | |
1309 delimiter='_'; | |
1310 index=s.indexOf(delimiter); | |
1311 } | |
1312 } | |
1313 int number=-1; | |
1314 | |
1315 Throwable e=null; | |
1316 | |
1317 if(index==2 && s.length()>3 && s.startsWith("gi") && Tools.isDigit(s.charAt(3))){ | |
1318 // System.err.println("Parsing gi number."); | |
1319 | |
1320 if(GiToTaxid.isInitialized()){ | |
1321 try { | |
1322 number=GiToTaxid.parseGiToTaxid(s, delimiter); | |
1323 } catch (Throwable e2) { | |
1324 e=e2; | |
1325 } | |
1326 }else{ | |
1327 assert(!CRASH_IF_NO_GI_TABLE) : "To use gi numbers, you must load a gi table.\n"+s; | |
1328 } | |
1329 // if(number!=-1){System.err.println("number="+number);} | |
1330 }else if(index==3 && s.length()>4 && s.startsWith("tid") && Tools.isDigit(s.charAt(4))){ | |
1331 // System.err.println("Parsing ncbi number."); | |
1332 number=GiToTaxid.parseTaxidNumber(s, delimiter); | |
1333 }else if(index==3 && s.length()>4 && s.startsWith("img") && Tools.isDigit(s.charAt(4))){ | |
1334 // System.err.println("Parsing ncbi number."); | |
1335 long img=parseDelimitedNumber(s, delimiter); | |
1336 ImgRecord record=imgMap.get(img); | |
1337 number=(record==null ? -1 : record.taxID); | |
1338 }else if(index==4 && s.length()>5 && s.startsWith("ncbi") && Tools.isDigit(s.charAt(5))){//obsolete | |
1339 // System.err.println("Parsing ncbi number."); | |
1340 number=GiToTaxid.parseTaxidNumber(s, delimiter); | |
1341 } | |
1342 | |
1343 if(number<0 && index>=0 && (delimiter=='|' || delimiter=='~')){ | |
1344 String[] split=(delimiter=='|' ? delimiterPipe.split(s) : delimiterTilde.split(s)); | |
1345 if(AccessionToTaxid.LOADED()){ | |
1346 number=parseAccessionToTaxid(split); | |
1347 } | |
1348 if(number<0){ | |
1349 number=parseHeaderNameToTaxid(split); | |
1350 } | |
1351 } | |
1352 | |
1353 if(number<0 && e!=null){ | |
1354 assert(false) : e; | |
1355 throw new RuntimeException(e); | |
1356 } | |
1357 | |
1358 //TaxServer code could go here... | |
1359 | |
1360 if(number>=0){return getNode(number);} | |
1361 } | |
1362 if(verbose){System.err.println("Can't process name "+s);} | |
1363 if(Tools.isDigit(s.charAt(0)) && s.length()<=9){ | |
1364 try { | |
1365 return getNode(Integer.parseInt(s)); | |
1366 } catch (NumberFormatException e) { | |
1367 //ignore | |
1368 } | |
1369 } | |
1370 return null; | |
1371 } | |
1372 | |
1373 /** Parse a delimited number from a header, or return -1 if formatted incorrectly. */ | |
1374 static long parseDelimitedNumber(String s, char delimiter){ | |
1375 if(s==null){return -1;} | |
1376 int i=0; | |
1377 while(i<s.length() && s.charAt(i)!=delimiter){i++;} | |
1378 i++; | |
1379 if(i>=s.length() || !Tools.isDigit(s.charAt(i))){return -1;} | |
1380 | |
1381 long number=0; | |
1382 while(i<s.length()){ | |
1383 char c=s.charAt(i); | |
1384 if(c==delimiter || c==' ' || c=='\t'){break;} | |
1385 assert(Tools.isDigit(c)) : c+"\n"+s; | |
1386 number=(number*10)+(c-'0'); | |
1387 i++; | |
1388 } | |
1389 return number; | |
1390 } | |
1391 | |
1392 /** Parses sequence headers using NCBI's current header system, with Accessions. */ | |
1393 private TaxNode getNodeNewStyle(final String s){ | |
1394 | |
1395 int space=s.indexOf(' '); | |
1396 int number=-1; | |
1397 | |
1398 if(AccessionToTaxid.LOADED()){ | |
1399 if(space>0){ | |
1400 number=AccessionToTaxid.get(s.substring(0, space)); | |
1401 }else{ | |
1402 number=AccessionToTaxid.get(s); | |
1403 } | |
1404 } | |
1405 | |
1406 if(number<0 && Tools.isDigit(s.charAt(0)) && s.length()<=9 && space<0){ | |
1407 try { | |
1408 return getNode(Integer.parseInt(s)); | |
1409 } catch (NumberFormatException e) { | |
1410 //ignore | |
1411 } | |
1412 } | |
1413 | |
1414 if(number<0 && space>0){ | |
1415 number=parseNameToTaxid(s.substring(space+1)); | |
1416 } | |
1417 | |
1418 if(number>-1){return getNode(number);} | |
1419 if(space<0 && s.indexOf('_')>0){ | |
1420 return getNodeNewStyle(s.replace('_', ' ')); | |
1421 } | |
1422 return null; | |
1423 } | |
1424 | |
1425 /** | |
1426 * For parsing old-style NCBI headers. | |
1427 */ | |
1428 public int parseAccessionToTaxid(String[] split){ | |
1429 if(split.length<4){ | |
1430 return -1; | |
1431 } | |
1432 int ncbi=AccessionToTaxid.get(split[3]); | |
1433 return ncbi; | |
1434 } | |
1435 | |
1436 /** | |
1437 * For parsing old-style NCBI headers. | |
1438 */ | |
1439 public int parseHeaderNameToTaxid(String[] split){ | |
1440 if(split.length<5){ | |
1441 return -1; | |
1442 } | |
1443 return parseNameToTaxid(split[4]); | |
1444 } | |
1445 | |
1446 /** | |
1447 * Returns the TaxID from the organism's scientific name (e.g. "Homo sapiens"). | |
1448 * If multiple nodes share the same name, returns the first; to get the full list, | |
1449 * use getNodesByNameExtended. | |
1450 * @param name Organism name. | |
1451 * @return Organism TaxID, or -1 if not found. | |
1452 */ | |
1453 public int parseNameToTaxid(String name){ | |
1454 // assert(false) : name+", "+(nameMap==null)+", "+(nameMap==null ? 0 : nameMap.size()); | |
1455 List<TaxNode> list=null; | |
1456 | |
1457 list=getNodesByNameExtended(name); | |
1458 | |
1459 if(list==null || list.size()>1){return -1;} | |
1460 return list.get(0).id; | |
1461 } | |
1462 | |
1463 /** | |
1464 * Fetch nodes indicated by this name. | |
1465 * @param name A taxonomic name delimited by space or underscore. | |
1466 * @return Nodes corresponding to the name. | |
1467 */ | |
1468 public List<TaxNode> getNodesByNameExtended(String name){ | |
1469 List<TaxNode> list=null; | |
1470 | |
1471 list=getNodesByName(name); | |
1472 if(list!=null){return list;} | |
1473 | |
1474 name=name.replaceAll("_", " ").trim(); | |
1475 list=getNodesByName(name); | |
1476 if(list!=null){return list;} | |
1477 | |
1478 String[] split2=name.split(" "); | |
1479 | |
1480 if(split2.length>7){ | |
1481 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]+" "+split2[7]; | |
1482 list=getNodesByName(term); | |
1483 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1484 if(list!=null){return list;} | |
1485 } | |
1486 | |
1487 if(split2.length>6){ | |
1488 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]+" "+split2[6]; | |
1489 list=getNodesByName(term); | |
1490 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1491 if(list!=null){return list;} | |
1492 } | |
1493 | |
1494 if(split2.length>5){ | |
1495 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]+" "+split2[5]; | |
1496 list=getNodesByName(term); | |
1497 // System.err.println("6:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1498 if(list!=null){return list;} | |
1499 } | |
1500 if(split2.length>4){ | |
1501 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]+" "+split2[4]; | |
1502 list=getNodesByName(term); | |
1503 // System.err.println("5:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1504 if(list!=null){return list;} | |
1505 } | |
1506 if(split2.length>3){ | |
1507 String term=split2[0]+" "+split2[1]+" "+split2[2]+" "+split2[3]; | |
1508 list=getNodesByName(term); | |
1509 // System.err.println("4:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1510 if(list!=null){return list;} | |
1511 } | |
1512 if(split2.length>2){ | |
1513 String term=split2[0]+" "+split2[1]+" "+split2[2]; | |
1514 list=getNodesByName(term); | |
1515 // System.err.println("3:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1516 if(list!=null){return list;} | |
1517 } | |
1518 if(split2.length>1){ | |
1519 String term=split2[0]+" "+split2[1]; | |
1520 list=getNodesByName(term); | |
1521 // System.err.println("2:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1522 if(list!=null){return list;} | |
1523 } | |
1524 if(split2.length>0){ | |
1525 String term=split2[0]; | |
1526 list=getNodesByName(term); | |
1527 // System.err.println("1:\n"+Arrays.toString(split)+"\n"+Arrays.toString(split2)+"\n"+term+" -> "+list); | |
1528 if(list!=null){return list;} | |
1529 } | |
1530 | |
1531 return null; | |
1532 } | |
1533 | |
1534 /*--------------------------------------------------------------*/ | |
1535 /*---------------- Assorted Methods ----------------*/ | |
1536 /*--------------------------------------------------------------*/ | |
1537 | |
1538 /** | |
1539 * Return the TaxID of the lowest ancestor node at least the specified level, | |
1540 * including this node itself. Level is the normal (non-extended) level. | |
1541 * @param taxID | |
1542 * @param taxLevel | |
1543 * @return | |
1544 */ | |
1545 public int promote(final int taxID, int taxLevel){ | |
1546 TaxNode tn=null; | |
1547 tn=(taxID<1 ? null : getNode(taxID)); | |
1548 tn=promote(tn, taxLevel); | |
1549 return (tn==null ? taxID : tn.id); | |
1550 } | |
1551 | |
1552 /** | |
1553 * Fetch the first node in this node's lineage of at least the indicated level. | |
1554 * This can be the node itself or an ancestor. | |
1555 * @see getNodeAtLevelExtended | |
1556 * @param tn Node in question | |
1557 * @param taxLevel Desired minimum level | |
1558 * @return A node at the desired level | |
1559 */ | |
1560 public TaxNode promote(TaxNode tn, int taxLevel){ | |
1561 while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){ | |
1562 TaxNode temp=getNode(tn.pid); | |
1563 if(temp==null || temp==tn || temp.level>=TaxTree.LIFE || temp.level>taxLevel){break;} | |
1564 tn=temp; | |
1565 } | |
1566 return tn; | |
1567 } | |
1568 | |
1569 /** | |
1570 * Determine the TaxID of the node's parent. | |
1571 * @param id TaxID of child node | |
1572 * @return Parent TaxID | |
1573 */ | |
1574 public int getParentID(int id){ | |
1575 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." | |
1576 + "\nPlease redownload it and regenerate the taxtree."; | |
1577 if(id<0 || id>=nodes.length){return -1;} | |
1578 TaxNode tn=nodes[id]; | |
1579 if(tn==null && mergedMap!=null){tn=getNode(mergedMap.get(id), true);} | |
1580 return tn==null ? -1 : tn.pid; | |
1581 } | |
1582 | |
1583 /** | |
1584 * Fetch the node with this TaxID. | |
1585 * @param id TaxID | |
1586 * @return Node | |
1587 */ | |
1588 public TaxNode getNode(int id){ | |
1589 assert(id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." | |
1590 + "\nPlease redownload it and regenerate the taxtree."; | |
1591 if(id<0 || id>=nodes.length){return null;} | |
1592 TaxNode tn=nodes[id]; | |
1593 if(tn!=null || mergedMap==null){return tn;} | |
1594 return getNode(mergedMap.get(id), true); | |
1595 } | |
1596 | |
1597 /** | |
1598 * Fetch the node with this TaxID, but don't throw assertions upon failure. | |
1599 * @param id TaxID | |
1600 * @return Node | |
1601 */ | |
1602 public TaxNode getNode(int id, boolean skipAssertion){ | |
1603 assert(skipAssertion || id<nodes.length) : id+", "+nodes.length+"\nYou have encountered a TaxID more recent than your NCBI dump." | |
1604 + "\nPlease redownload it and regenerate the taxtree."; | |
1605 if(id<0 || id>=nodes.length){return null;} | |
1606 TaxNode tn=nodes[id]; | |
1607 if(tn!=null || mergedMap==null){return tn;} | |
1608 return getNode(mergedMap.get(id), true); | |
1609 } | |
1610 | |
1611 public TaxNode getNodeAtLevel(int id, int minLevel){ | |
1612 return getNodeAtLevel(id, minLevel, DOMAIN); | |
1613 } | |
1614 | |
1615 public TaxNode getNodeAtLevelExtended(int id, int minLevelE){ | |
1616 return getNodeAtLevelExtended(id, minLevelE, DOMAIN_E); | |
1617 } | |
1618 | |
1619 public TaxNode getNodeAtLevel(int id, int minLevel, int maxLevel){ | |
1620 final int minLevelExtended=levelToExtended(minLevel); | |
1621 final int maxLevelExtended=levelToExtended(maxLevel); | |
1622 return getNodeAtLevelExtended(id, minLevelExtended, maxLevelExtended); | |
1623 } | |
1624 | |
1625 public TaxNode getNodeAtLevelExtended(int id, int minLevelE, int maxLevelE){ | |
1626 TaxNode tn=getNode(id); | |
1627 while(tn!=null && tn.pid!=tn.id && tn.levelExtended<minLevelE){ | |
1628 TaxNode temp=getNode(tn.pid); | |
1629 if(temp==null || temp.levelExtended>maxLevelE){break;} | |
1630 tn=temp; | |
1631 } | |
1632 return tn; | |
1633 } | |
1634 | |
1635 public int getIdAtLevelExtended(int taxID, int taxLevelExtended){ | |
1636 if(taxLevelExtended<0){return taxID;} | |
1637 TaxNode tn=getNode(taxID); | |
1638 while(tn!=null && tn.id!=tn.pid && tn.levelExtended<taxLevelExtended){ | |
1639 tn=getNode(tn.pid); | |
1640 if(tn.levelExtended>taxLevelExtended){break;} | |
1641 taxID=tn.id; | |
1642 } | |
1643 return taxID; | |
1644 } | |
1645 | |
1646 /** | |
1647 * Fetch the node with this name. | |
1648 * Throw an assertion if there are multiple such nodes. | |
1649 * @param s Organism name. | |
1650 * @return Node with given name. | |
1651 */ | |
1652 public TaxNode getNodeByName(String s){ | |
1653 List<TaxNode> list=getNodesByName(s, false); | |
1654 if(list==null){list=getNodesByName(s, true);} | |
1655 if(list==null || list.isEmpty()){return null;} | |
1656 if(list.size()==1){return list.get(0);} | |
1657 assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n"; | |
1658 TaxNode a=list.get(0); | |
1659 for(int i=1; i<list.size(); i++){ | |
1660 TaxNode b=list.get(i); | |
1661 //Keep the most specific node | |
1662 // if(a==null || (b!=null && b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf())){//not necessary | |
1663 if(b.minAncestorLevelIncludingSelf()<a.minAncestorLevelIncludingSelf()){ | |
1664 a=b; | |
1665 } | |
1666 } | |
1667 return a; | |
1668 } | |
1669 | |
1670 /** | |
1671 * Fetch a list of all nodes with this name. | |
1672 * @param s Organism name. | |
1673 * @return Nodes with given name. | |
1674 */ | |
1675 public List<TaxNode> getNodesByName(String s){ | |
1676 List<TaxNode> list=getNodesByName(s, false); | |
1677 if(list==null){list=getNodesByName(s, true);} | |
1678 return list; | |
1679 } | |
1680 | |
1681 /** | |
1682 * Fetch a map of names to nodes. If absent, create it first. | |
1683 * @param lowercase If true, return the map with lowercase keys. | |
1684 * @return Map of names to nodes. | |
1685 */ | |
1686 private HashMap<String, ArrayList<TaxNode>> getMap(boolean lowercase){ | |
1687 HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap); | |
1688 if(map==null){ | |
1689 synchronized(this){hashNames(true);} | |
1690 map=(lowercase ? nameMapLower : nameMap); | |
1691 } | |
1692 assert(map!=null) : "Tax names were not hashed."; | |
1693 return map; | |
1694 } | |
1695 | |
1696 private List<TaxNode> getNodesByName(String s, boolean lowercase){ | |
1697 if(s==null){return null;} | |
1698 if(s.indexOf('_')>=0){s=s.replace('_', ' ');} | |
1699 if(lowercase){s=s.toLowerCase();} | |
1700 // System.err.println("Searching for "+s); | |
1701 final HashMap<String, ArrayList<TaxNode>> map=getMap(lowercase); | |
1702 ArrayList<TaxNode> list=map.get(s); | |
1703 if(list!=null){return list;} | |
1704 // System.err.println("No matches for '"+s+"'"); | |
1705 | |
1706 // assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s); | |
1707 | |
1708 if(s.indexOf('_')<0 && s.indexOf(' ')<0){return null;} | |
1709 String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8); | |
1710 // System.err.println("Array: "+Arrays.toString(split)); | |
1711 list=map.get(split[split.length-1]); | |
1712 if(list==null){return list;} | |
1713 // System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")"); | |
1714 | |
1715 int matchCount=0; | |
1716 for(TaxNode tn : list){ | |
1717 if(tn.matchesName(split, split.length-1, this)){matchCount++;} | |
1718 } | |
1719 if(matchCount==list.size()){return list;} | |
1720 if(matchCount<1){return null;} | |
1721 ArrayList<TaxNode> hits=new ArrayList<TaxNode>(matchCount); | |
1722 for(TaxNode tn : list){ | |
1723 if(tn.matchesName(split, split.length-1, this)){hits.add(tn);} | |
1724 } | |
1725 return hits; | |
1726 } | |
1727 public ArrayList<TaxNode> getAncestors(int id){ | |
1728 TaxNode current=getNode(id); | |
1729 ArrayList<TaxNode> list=new ArrayList<TaxNode>(); | |
1730 while(current!=null && current.pid!=current.id){//ignores root | |
1731 list.add(current); | |
1732 current=getNode(current.pid); | |
1733 } | |
1734 //optionally add root here | |
1735 return list; | |
1736 } | |
1737 | |
1738 public void increment(IntList ids, IntList counts, boolean sync){ | |
1739 | |
1740 ids.sort(); | |
1741 ids.getUniqueCounts(counts); | |
1742 | |
1743 if(!sync){ | |
1744 for(int i=0; i<ids.size; i++){ | |
1745 int id=ids.get(i); | |
1746 int count=counts.get(i); | |
1747 incrementRaw(id, count); | |
1748 } | |
1749 }else{ | |
1750 synchronized(this){ | |
1751 for(int i=0; i<ids.size; i++){ | |
1752 int id=ids.get(i); | |
1753 int count=counts.get(i); | |
1754 incrementRaw(id, count); | |
1755 } | |
1756 } | |
1757 } | |
1758 } | |
1759 | |
1760 public void incrementRaw(int id, long amt){ | |
1761 assert(id>=0 && id<nodes.length) : "TaxID "+id+" is out of range."+(id<0 ? "" : " Possibly the taxonomy data needs to be updated."); | |
1762 assert(nodes[id]!=null) : "No node for TaxID "+id+"; possibly the taxonomy data needs to be updated."; | |
1763 nodes[id].incrementRaw(amt); | |
1764 } | |
1765 | |
1766 public void percolateUp(){ | |
1767 for(int i=0; i<treeLevelsExtended.length; i++){ | |
1768 percolateUp(i); | |
1769 } | |
1770 } | |
1771 | |
1772 public void percolateUp(final int fromLevel){ | |
1773 final TaxNode[] stratum=treeLevelsExtended[fromLevel]; | |
1774 for(final TaxNode n : stratum){ | |
1775 n.incrementSum(n.countRaw); | |
1776 TaxNode parent=nodes[n.pid]; | |
1777 if(n!=parent){ | |
1778 parent.incrementSum(n.countSum); | |
1779 } | |
1780 } | |
1781 } | |
1782 | |
1783 /** Add this amount to the node and all its ancestors. */ | |
1784 public void percolateUp(TaxNode node, long amt){ | |
1785 if(amt==0){return;} | |
1786 if(verbose){System.err.println("percolateUp("+amt+") node: "+node);} | |
1787 while(node.id!=node.pid){ | |
1788 node.incrementSum(amt); | |
1789 node=nodes[node.pid]; | |
1790 } | |
1791 node.incrementSum(amt); | |
1792 } | |
1793 | |
1794 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){ | |
1795 return gatherNodesAtLeastLimit(limit, 0, nodesPerLevelExtended.length-1); | |
1796 } | |
1797 | |
1798 public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){ | |
1799 final int minLevelExtended=levelToExtended(minLevel); | |
1800 final int maxLevelExtended=levelToExtended(maxLevel); | |
1801 // assert(false) : limit+", "+minLevel+", "+maxLevel+", "+minLevelExtended+", "+maxLevelExtended; | |
1802 ArrayList<TaxNode> list=new ArrayList<TaxNode>(); | |
1803 for(int i=minLevelExtended; i<nodesPerLevelExtended.length && i<=maxLevelExtended; i++){ | |
1804 list.addAll(gatherNodesAtLeastLimitExtended(i, limit)); | |
1805 } | |
1806 Shared.sort(list, TaxNode.countComparator); | |
1807 return list; | |
1808 } | |
1809 | |
1810 public ArrayList<TaxNode> gatherNodesAtLeastLimitExtended(final int fromLevelExtended, final long limit){ | |
1811 ArrayList<TaxNode> list=new ArrayList<TaxNode>(); | |
1812 final TaxNode[] stratum=treeLevelsExtended[fromLevelExtended]; | |
1813 for(final TaxNode n : stratum){ | |
1814 if(n.countSum>=limit){ | |
1815 list.add(n); | |
1816 TaxNode parent=nodes[n.pid]; | |
1817 if(n!=parent){ | |
1818 percolateUp(parent, -n.countSum);//123 This was negative for some reason | |
1819 } | |
1820 } | |
1821 } | |
1822 Shared.sort(list, TaxNode.countComparator); | |
1823 return list; | |
1824 } | |
1825 | |
1826 /*--------------------------------------------------------------*/ | |
1827 /*---------------- Static Initializers ----------------*/ | |
1828 /*--------------------------------------------------------------*/ | |
1829 | |
1830 /** | |
1831 * Generate the name to level number map. | |
1832 */ | |
1833 private static HashMap<String, Integer> makeLevelMap() { | |
1834 HashMap<String, Integer> map=new HashMap<String, Integer>(31); | |
1835 for(int i=0; i<taxLevelNames.length; i++){ | |
1836 map.put(taxLevelNames[i], i); | |
1837 map.put(taxLevelNames[i].toUpperCase(), i); | |
1838 } | |
1839 map.put("clade", NO_RANK); | |
1840 map.put("clade".toUpperCase(), NO_RANK); | |
1841 return map; | |
1842 } | |
1843 | |
1844 /** | |
1845 * Generate the name to extended level number map. | |
1846 */ | |
1847 private static HashMap<String, Integer> makeLevelMapExtended() { | |
1848 HashMap<String, Integer> map=new HashMap<String, Integer>(129); | |
1849 for(int i=0; i<taxLevelNamesExtended.length; i++){ | |
1850 map.put(taxLevelNamesExtended[i], i); | |
1851 map.put(taxLevelNamesExtended[i].toUpperCase(), i); | |
1852 } | |
1853 map.put("clade", NO_RANK_E); | |
1854 map.put("clade".toUpperCase(), NO_RANK_E); | |
1855 return map; | |
1856 } | |
1857 | |
1858 /** | |
1859 * I think this maps normal and extend names to normal level numbers. | |
1860 */ | |
1861 private static HashMap<String, Integer> makeAltLevelMap() { | |
1862 HashMap<String, Integer> map=new HashMap<String, Integer>(129); | |
1863 for(int i=0; i<taxLevelNames.length; i++){ | |
1864 map.put(taxLevelNames[i], i); | |
1865 map.put(taxLevelNames[i].toUpperCase(), i); | |
1866 } | |
1867 map.put("clade", NO_RANK); | |
1868 map.put("clade".toUpperCase(), NO_RANK); | |
1869 | |
1870 //Add synonyms | |
1871 // map.put("subfamily", map.get("family")); | |
1872 // map.put("tribe", map.get("family")); | |
1873 // map.put("varietas", map.get("subspecies")); | |
1874 // map.put("subgenus", map.get("genus")); | |
1875 // map.put("forma", map.get("subspecies")); | |
1876 // map.put("species group", map.get("genus")); | |
1877 // map.put("species subgroup", map.get("genus")); | |
1878 // map.put("cohort", map.get("class")); | |
1879 // map.put("subclass", map.get("class")); | |
1880 // map.put("infraorder", map.get("order")); | |
1881 // map.put("superorder", map.get("class")); | |
1882 // map.put("subphylum", map.get("phylum")); | |
1883 // map.put("infraclass", map.get("class")); | |
1884 // map.put("superkingdom", map.get("division")); | |
1885 // map.put("parvorder", map.get("order")); | |
1886 // map.put("superclass", map.get("phylum")); | |
1887 // map.put("superphylum", map.get("kingdom")); | |
1888 // map.put("subkingdom", map.get("kingdom")); | |
1889 // map.put("superfamily", map.get("order")); | |
1890 // map.put("superkingdom", map.get("domain")); | |
1891 // map.put("suborder", map.get("order")); | |
1892 // map.put("subtribe", map.get("family")); | |
1893 | |
1894 for(String[] array : taxLevelNamesExtendedMatrix){ | |
1895 String head=array[array.length-1]; | |
1896 Integer value=map.get(head); | |
1897 assert(value!=null) : head; | |
1898 for(String key : array){ | |
1899 if(key!=head){ | |
1900 assert(!map.containsKey(key)) : "Map already contains key "+key+": "+Arrays.toString(array); | |
1901 map.put(key, value); | |
1902 map.put(key.toUpperCase(), value); | |
1903 } | |
1904 } | |
1905 } | |
1906 | |
1907 return map; | |
1908 } | |
1909 | |
1910 /*--------------------------------------------------------------*/ | |
1911 /*---------------- Size ----------------*/ | |
1912 /*--------------------------------------------------------------*/ | |
1913 | |
1914 /** Number of bp associated with this node in RefSeq */ | |
1915 public long toSize(TaxNode tn){ | |
1916 if(tn==null){return 0;} | |
1917 if(refseqSizeMap==null){return -1L;} | |
1918 final long x=refseqSizeMap.get(tn.id); | |
1919 return Tools.max(0, x); | |
1920 } | |
1921 | |
1922 /** Number of bp associated with this node and descendants in RefSeq */ | |
1923 public long toSizeC(TaxNode tn){ | |
1924 if(tn==null){return 0;} | |
1925 if(refseqSizeMapC==null){return -1L;} | |
1926 final long x=refseqSizeMapC.get(tn.id); | |
1927 return Tools.max(0, x); | |
1928 } | |
1929 | |
1930 /** Number of sequences associated with this node in RefSeq */ | |
1931 public int toSeqs(TaxNode tn){ | |
1932 if(tn==null){return 0;} | |
1933 if(refseqSeqMap==null){return -1;} | |
1934 final int x=refseqSeqMap.get(tn.id); | |
1935 return Tools.max(0, x); | |
1936 } | |
1937 | |
1938 /** Number of sequences associated with this node and descandants in RefSeq */ | |
1939 public long toSeqsC(TaxNode tn){ | |
1940 if(tn==null){return 0;} | |
1941 if(refseqSeqMapC==null){return -1L;} | |
1942 final long x=refseqSeqMapC.get(tn.id); | |
1943 return Tools.max(0, x); | |
1944 } | |
1945 | |
1946 /** Number of descendants of this node */ | |
1947 public int toNodes(TaxNode tn){ | |
1948 if(tn==null){return 0;} | |
1949 if(nodeMapC==null){return -1;} | |
1950 final int x=nodeMapC.get(tn.id); | |
1951 return Tools.max(0, x); | |
1952 } | |
1953 | |
1954 /** | |
1955 * Fills refseqSizeMap, refseqSizeMapC, etc. from a file containing the summary. | |
1956 * @param fname Size file name | |
1957 */ | |
1958 public void loadSizeFile(String fname){ | |
1959 if(fname==null){return;} | |
1960 assert(refseqSizeMap==null); | |
1961 refseqSizeMap=new IntLongHashMap(); | |
1962 refseqSizeMapC=new IntLongHashMap(); | |
1963 refseqSeqMap=new IntHashMap(); | |
1964 refseqSeqMapC=new IntLongHashMap(); | |
1965 nodeMapC=new IntHashMap(); | |
1966 | |
1967 final ByteFile bf=ByteFile.makeByteFile(fname, true); | |
1968 final byte delimiter='\t'; | |
1969 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ | |
1970 if(line.length>0 && line[0]!='#'){ | |
1971 int a=0, b=0; | |
1972 | |
1973 while(b<line.length && line[b]!=delimiter){b++;} | |
1974 assert(b>a) : "Missing field 0: "+new String(line); | |
1975 int tid=Parse.parseInt(line, a, b); | |
1976 b++; | |
1977 a=b; | |
1978 | |
1979 while(b<line.length && line[b]!=delimiter){b++;} | |
1980 assert(b>a) : "Missing field 1: "+new String(line); | |
1981 long size=Parse.parseLong(line, a, b); | |
1982 b++; | |
1983 a=b; | |
1984 | |
1985 while(b<line.length && line[b]!=delimiter){b++;} | |
1986 assert(b>a) : "Missing field 2: "+new String(line); | |
1987 long csize=Parse.parseLong(line, a, b); | |
1988 b++; | |
1989 a=b; | |
1990 | |
1991 while(b<line.length && line[b]!=delimiter){b++;} | |
1992 assert(b>a) : "Missing field 3: "+new String(line); | |
1993 int seqs=Parse.parseInt(line, a, b); | |
1994 b++; | |
1995 a=b; | |
1996 | |
1997 while(b<line.length && line[b]!=delimiter){b++;} | |
1998 assert(b>a) : "Missing field 4: "+new String(line); | |
1999 long cseqs=Parse.parseLong(line, a, b); | |
2000 b++; | |
2001 a=b; | |
2002 | |
2003 while(b<line.length && line[b]!=delimiter){b++;} | |
2004 assert(b>a) : "Missing field 5: "+new String(line); | |
2005 int cnodes=Parse.parseInt(line, a, b); | |
2006 b++; | |
2007 a=b; | |
2008 | |
2009 if(refseqSizeMap!=null && size>0){refseqSizeMap.put(tid, size);} | |
2010 if(refseqSizeMapC!=null && csize>0){refseqSizeMapC.put(tid, csize);} | |
2011 if(refseqSeqMap!=null && seqs>0){refseqSeqMap.put(tid, seqs);} | |
2012 if(refseqSeqMapC!=null && cseqs>0){refseqSeqMapC.put(tid, cseqs);} | |
2013 if(nodeMapC!=null && cnodes>0){nodeMapC.put(tid, cnodes);} | |
2014 } | |
2015 } | |
2016 bf.close(); | |
2017 } | |
2018 | |
2019 /*--------------------------------------------------------------*/ | |
2020 /*---------------- IMG ----------------*/ | |
2021 /*--------------------------------------------------------------*/ | |
2022 | |
2023 public static int imgToTaxid(long img){ | |
2024 ImgRecord ir=imgMap.get(img); | |
2025 // assert(false) : "\n"+img+"\n"+imgMap.get(img)+"\n"+562+"\n"+imgMap.get(562)+"\n"+imgMap.size()+"\n"+IMGHQ+"\n"+defaultImgFile()+"\n"; | |
2026 return ir==null ? -1 : ir.taxID; | |
2027 } | |
2028 | |
2029 public TaxNode imgToTaxNode(long img){ | |
2030 int tid=imgToTaxid(img); | |
2031 return tid<1 ? null : getNode(tid); | |
2032 } | |
2033 | |
2034 // public static int loadIMGOld(String fname, boolean storeName, PrintStream outstream){ | |
2035 // assert(imgMap==null); | |
2036 // if(fname==null){return 0;} | |
2037 // ImgRecord2.storeName=storeName; | |
2038 // if(outstream!=null){System.err.println("Loading IMG.");} | |
2039 // Timer t=new Timer(outstream, false); | |
2040 // ImgRecord2[] array=ImgRecord2.toArray(fname); | |
2041 // int x=loadIMG(array); | |
2042 // t.stopAndPrint(); | |
2043 // return x; | |
2044 // } | |
2045 | |
2046 public static int loadIMG(String fname, boolean storeName, PrintStream outstream){ | |
2047 assert(imgMap==null); | |
2048 if(fname==null){return 0;} | |
2049 ImgRecord.storeName=storeName; | |
2050 if(outstream!=null){System.err.println("Loading IMG.");} | |
2051 Timer t=new Timer(outstream, false); | |
2052 ImgRecord[] array=ImgRecord.toArray(fname, IMG_HQ); | |
2053 int x=loadIMG(array); | |
2054 t.stopAndPrint(); | |
2055 return x; | |
2056 } | |
2057 | |
2058 public static int loadIMG(ImgRecord[] array){ | |
2059 assert(imgMap==null); | |
2060 imgMap=new HashMap<Long, ImgRecord>((int)(array.length*1.5)); | |
2061 for(ImgRecord record : array){ | |
2062 imgMap.put(record.imgID, record); | |
2063 } | |
2064 return imgMap.size(); | |
2065 } | |
2066 | |
2067 @Deprecated | |
2068 public static int parseLevel(String b){ | |
2069 final int level; | |
2070 if(b==null){level=-1;} | |
2071 else if(Tools.isNumeric(b.charAt(0))){ | |
2072 level=Integer.parseInt(b); | |
2073 }else{ | |
2074 level=stringToLevel(b.toLowerCase()); | |
2075 } | |
2076 return level; | |
2077 } | |
2078 | |
2079 public static int parseLevelExtended(String b){ | |
2080 final int level; | |
2081 if(b==null){level=-1;} | |
2082 else if(Tools.isNumeric(b.charAt(0))){ | |
2083 level=levelToExtended(Integer.parseInt(b)); | |
2084 }else{ | |
2085 level=stringToLevelExtended(b.toLowerCase()); | |
2086 } | |
2087 return level; | |
2088 } | |
2089 | |
2090 public boolean isUnclassified(int tid){ | |
2091 TaxNode tn=getNode(tid); | |
2092 while(tn!=null && tn.id!=tn.pid){ | |
2093 if(tn.isUnclassified()){return true;} | |
2094 if(tn.pid==tn.id){break;} | |
2095 tn=getNode(tn.pid); | |
2096 } | |
2097 return false; | |
2098 } | |
2099 | |
2100 public boolean isEnvironmentalSample(int tid){ | |
2101 TaxNode tn=getNode(tid); | |
2102 while(tn!=null && tn.id!=tn.pid){ | |
2103 if(tn.isEnvironmentalSample()){return true;} | |
2104 if(tn.pid==tn.id){break;} | |
2105 tn=getNode(tn.pid); | |
2106 } | |
2107 return false; | |
2108 } | |
2109 | |
2110 public boolean isVirus(int tid){ | |
2111 TaxNode tn=getNode(tid); | |
2112 while(tn!=null && tn.id!=tn.pid){ | |
2113 if(tn.id==VIRUSES_ID){return true;} | |
2114 if(tn.pid==tn.id){break;} | |
2115 tn=getNode(tn.pid); | |
2116 } | |
2117 return false; | |
2118 } | |
2119 | |
2120 public long definedLevels(int tid){ | |
2121 long levels=0; | |
2122 TaxNode tn=getNode(tid); | |
2123 while(tn!=null && tn.id!=tn.pid){ | |
2124 levels=levels|(1L<<tn.level); | |
2125 } | |
2126 return levels; | |
2127 } | |
2128 | |
2129 public long definedLevelsExtended(int tid){ | |
2130 long levels=0; | |
2131 TaxNode tn=getNode(tid); | |
2132 while(tn!=null && tn.id!=tn.pid){ | |
2133 levels=levels|(1L<<tn.levelExtended); | |
2134 } | |
2135 return levels; | |
2136 } | |
2137 | |
2138 /** | |
2139 * Generates the mononomial name for this taxonomic level based on the scientific name. | |
2140 * For example, "Homo sapiens" -> "Sapiens" | |
2141 * @param tid TaxID | |
2142 * @return Correct name for this node. | |
2143 */ | |
2144 public String mononomial(int tid){return mononomial(getNode(tid));} | |
2145 public String mononomial(TaxNode tn){ | |
2146 if(tn==null){return null;} | |
2147 String name=tn.name; | |
2148 if(name.indexOf(' ')<0){return name;} | |
2149 TaxNode parent=getNode(tn.pid); | |
2150 if(parent==null){return name;} | |
2151 String pname=parent.name; | |
2152 if(name.length()>pname.length() && name.charAt(pname.length())==' ' && name.startsWith(pname)){ | |
2153 name=name.substring(pname.length()+1); | |
2154 } | |
2155 return name; | |
2156 } | |
2157 | |
2158 /*--------------------------------------------------------------*/ | |
2159 /*---------------- Fields ----------------*/ | |
2160 /*--------------------------------------------------------------*/ | |
2161 | |
2162 /** All nodes in the tree in a flat array, indexed by TaxiD */ | |
2163 public final TaxNode[] nodes; | |
2164 | |
2165 /** Number of nodes per normal level */ | |
2166 public final int[] nodesPerLevel=new int[taxLevelNames.length]; | |
2167 | |
2168 /** Number of nodes per extended level */ | |
2169 public final int[] nodesPerLevelExtended=new int[taxLevelNamesExtended.length]; | |
2170 | |
2171 /** Number of nodes in the tree */ | |
2172 public final int nodeCount; | |
2173 | |
2174 /** Maps old TaxIDs to new TaxIDs */ | |
2175 public final IntHashMap mergedMap; | |
2176 | |
2177 /** Arrays of all nodes at a given taxonomic level (extended) */ | |
2178 public final TaxNode[][] treeLevelsExtended=new TaxNode[taxLevelNamesExtended.length][]; | |
2179 | |
2180 /** Map of names to nodes */ | |
2181 HashMap<String, ArrayList<TaxNode>> nameMap; | |
2182 /** Map of lowercase names to nodes */ | |
2183 HashMap<String, ArrayList<TaxNode>> nameMapLower; | |
2184 /** Map of nodes to child nodes */ | |
2185 HashMap<TaxNode, ArrayList<TaxNode>> childMap; | |
2186 public HashMap<String, ArrayList<TaxNode>> nameMap(){return nameMap;} | |
2187 | |
2188 @Deprecated | |
2189 public int minValidTaxa=0; //TODO: Remove (will break serialization) | |
2190 | |
2191 /** Infer ranks for no-rank nodes, when possible */ | |
2192 public boolean simplify=true; | |
2193 /** See simplify() for details, works in conjunction with simplify */ | |
2194 public boolean reassign=true; | |
2195 /** Discard no-rank nodes */ | |
2196 public boolean skipNorank=false; | |
2197 public int inferRankLimit=0;//levelMap.get("species"); | |
2198 | |
2199 //Node Statistics | |
2200 /** Number of bases assigned to this TaxID in RefSeq */ | |
2201 private IntLongHashMap refseqSizeMap; | |
2202 /** Number of bases assigned to this TaxID and descendants in RefSeq */ | |
2203 private IntLongHashMap refseqSizeMapC; | |
2204 /** Number of sequences assigned to this TaxID in RefSeq */ | |
2205 private IntHashMap refseqSeqMap; | |
2206 /** Number of sequences assigned to this TaxID and descendants in RefSeq */ | |
2207 private IntLongHashMap refseqSeqMapC; | |
2208 /** Number of descendant nodes, inclusive, for each TaxID */ | |
2209 private IntHashMap nodeMapC; | |
2210 | |
2211 /*--------------------------------------------------------------*/ | |
2212 /*---------------- Statics ----------------*/ | |
2213 /*--------------------------------------------------------------*/ | |
2214 | |
2215 /** Assign levels to unranked nodes below species level, when possible */ | |
2216 public static boolean assignStrains=true; | |
2217 /** Assume headers are in Silva format */ | |
2218 public static boolean SILVA_MODE=false; | |
2219 /** Assume headers are in Unite format */ | |
2220 public static boolean UNITE_MODE=false; | |
2221 /** Probably unnecessary at this point... present for legacy reasons */ | |
2222 public static boolean CRASH_IF_NO_GI_TABLE=true; | |
2223 | |
2224 public static boolean verbose=false; | |
2225 public static boolean SHOW_WARNINGS=false; | |
2226 | |
2227 /** Maps IMG IDs to records from the dump file */ | |
2228 private static HashMap<Long, ImgRecord> imgMap; | |
2229 | |
2230 /** Set to false if the tree is expected to be mutated. | |
2231 * @TODO Remove mutable fields from the tree (like counters). | |
2232 */ | |
2233 public static boolean ALLOW_SHARED_TREE=true; | |
2234 | |
2235 /** Universal location of the shared TaxTree used by various classes */ | |
2236 private static TaxTree sharedTree; | |
2237 | |
2238 /** A simpler and probably less safe version of sharedTree(...) */ | |
2239 public static TaxTree getTree(){return sharedTree;} | |
2240 | |
2241 /** | |
2242 * Fetch the shared tree, loading it from file if not present. | |
2243 * @return A tree. | |
2244 * @TODO: Check proper-construction of double-checked synchronize | |
2245 */ | |
2246 private static TaxTree sharedTree(String fname, boolean hashNames, boolean hashDotFormat, PrintStream outstream) { | |
2247 if(!ALLOW_SHARED_TREE){return null;} | |
2248 if(sharedTree==null && fname!=null){ | |
2249 if("auto".equalsIgnoreCase(fname)){fname=defaultTreeFile();} | |
2250 synchronized(TaxTree.class){ | |
2251 if(sharedTree==null){ | |
2252 if(outstream!=null){outstream.println("Loading tax tree.");} | |
2253 Timer t=new Timer(outstream, false); | |
2254 setSharedTree(ReadWrite.read(TaxTree.class, fname, true), hashNames, hashDotFormat); | |
2255 t.stopAndPrint(); | |
2256 } | |
2257 } | |
2258 } | |
2259 if(hashNames && sharedTree.nameMap==null){ | |
2260 synchronized(sharedTree){ | |
2261 if(sharedTree.nameMap==null){ | |
2262 if(outstream!=null){outstream.println("Hashing names.");} | |
2263 Timer t=new Timer(outstream, false); | |
2264 sharedTree.hashNames(hashDotFormat); | |
2265 t.stopAndPrint(); | |
2266 } | |
2267 } | |
2268 } | |
2269 return sharedTree; | |
2270 } | |
2271 | |
2272 /** | |
2273 * For initialization. Normally only one tree is needed by a process so it is set here. | |
2274 * If the tree is already set nothing will happen, unless additional hashing is needed. | |
2275 */ | |
2276 private static synchronized void setSharedTree(TaxTree tree, boolean hashNames, boolean hashDotFormat){ | |
2277 assert(ALLOW_SHARED_TREE); | |
2278 assert(sharedTree==null); | |
2279 sharedTree=tree; | |
2280 if(hashNames && sharedTree.nameMap==null){ | |
2281 synchronized(sharedTree){ | |
2282 if(sharedTree.nameMap==null){ | |
2283 sharedTree.hashNames(hashDotFormat); | |
2284 } | |
2285 } | |
2286 } | |
2287 } | |
2288 | |
2289 /** | |
2290 * Determine whether a taxonomic level is standard. e.g.:<br> | |
2291 * isSimple("phylum")=true<br> | |
2292 * isSimple("subphylum")=false<br> | |
2293 * isSimple("no-rank")=false | |
2294 * @param levelExtended The extended level to test. | |
2295 * @return True if this level is not no-rank, and the names of the normal and extended levels match. | |
2296 */ | |
2297 public static boolean isSimple(int levelExtended){ | |
2298 int level=extendedToLevel(levelExtended); | |
2299 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level)); | |
2300 } | |
2301 | |
2302 /** | |
2303 * Determine whether a taxonomic level is standard, but allows substrain and lower. e.g.:<br> | |
2304 * isSimple("phylum")=true<br> | |
2305 * isSimple("substrain")=true<br> | |
2306 * isSimple("subphylum")=false<br> | |
2307 * isSimple("no-rank")=false | |
2308 * @param levelExtended The extended level to test. | |
2309 * @return True if this level is not no-rank, and the names of the normal and extended levels match. | |
2310 */ | |
2311 public static boolean isSimple2(int levelExtended){ | |
2312 int level=extendedToLevel(levelExtended); | |
2313 return levelExtended!=NO_RANK_E && (levelExtended==levelToExtended(level) | |
2314 || levelExtended==STRAIN_E || levelExtended==SUBSPECIES_E || levelExtended==SUBSTRAIN_E); | |
2315 } | |
2316 | |
2317 /*--------------------------------------------------------------*/ | |
2318 /*---------------- Constants ----------------*/ | |
2319 /*--------------------------------------------------------------*/ | |
2320 | |
2321 /** Get the number for the normal level of this name */ | |
2322 public static final int stringToLevel(String s){return altLevelMap.get(s);} | |
2323 public static final boolean levelMapExtendedContains(String s){return levelMapExtended.containsKey(s);} | |
2324 /** Get the number for the extended level of this name */ | |
2325 public static final int stringToLevelExtended(String s){return levelMapExtended.get(s);} | |
2326 /** Get the normal name for this normal level */ | |
2327 public static final String levelToString(int x){return taxLevelNames[x];} | |
2328 /** Get the extended name for this extended level */ | |
2329 public static final String levelToStringExtended(int x){return taxLevelNamesExtended[x];} | |
2330 /** Get the abbreviated name for this normal level */ | |
2331 public static final String levelToStringShort(int x){return taxLevelNamesShort[x];} | |
2332 | |
2333 /** Normal, aka canonical, aka simple tax level names */ | |
2334 private static final String[] taxLevelNames=new String[] { | |
2335 "no rank", "subspecies", "species", "genus", | |
2336 "family", "order", "class", "phylum", | |
2337 "kingdom", "superkingdom", "domain", "life" | |
2338 }; | |
2339 public static final int numTaxLevelNames=taxLevelNames.length; | |
2340 | |
2341 /** | |
2342 * Definitive representation of all NCBI taxonomic level names. | |
2343 * All levels used by NCBI must be present here, or parsing a new NCBI tax tree will crash. | |
2344 * The first dimension maps normal ranks to extended ranks. | |
2345 * Both dimensions are ordered ascending. | |
2346 * @TODO Note! If this goes over 63 names it will cause a problem with getDefinedLevels(). | |
2347 */ | |
2348 //TODO See @TODO | |
2349 private static final String[][] taxLevelNamesExtendedMatrix=new String[][] { | |
2350 {"no rank"}, | |
2351 {"subgenotype", "genotype", "substrain", "isolate", "strain", "pathotype", "pathogroup", | |
2352 "biotype", "serotype", "serogroup", "morph", "forma specialis", "forma", "subvariety", "varietas", | |
2353 "subspecies"}, | |
2354 {"species"}, | |
2355 {"species subgroup", "species group", "series", "subsection", "section", "subgenus", "genus"}, | |
2356 {"subtribe", "tribe", "subfamily", "family"}, | |
2357 {"superfamily", "parvorder", "infraorder", "suborder", "order"}, | |
2358 {"superorder", "subcohort", "cohort", "infraclass", "subclass", "class"}, | |
2359 {"superclass", "subdivision", "division", "subphylum", "phylum"}, | |
2360 {"superphylum", "subkingdom", "kingdom"}, | |
2361 {"superkingdom"}, | |
2362 {"domain"}, | |
2363 {"life"} | |
2364 }; | |
2365 | |
2366 /** Extended tax level names as a 1D array */ | |
2367 private static final String[] taxLevelNamesExtended=makeNamesExtended(); | |
2368 /** Number of extended tax levels */ | |
2369 public static final int numTaxLevelNamesExtended=taxLevelNamesExtended.length; | |
2370 | |
2371 /** Flatten the extended tax level names matrix to a 1D array */ | |
2372 private static final String[] makeNamesExtended(){ | |
2373 ArrayList<String> list=new ArrayList<String>(); | |
2374 for(String[] s : taxLevelNamesExtendedMatrix){ | |
2375 for(String ss : s){ | |
2376 list.add(ss); | |
2377 } | |
2378 } | |
2379 return list.toArray(new String[0]); | |
2380 } | |
2381 | |
2382 /** Abbreviations of tax level names, mainly for semicolon form */ | |
2383 private static final String[] taxLevelNamesShort=new String[] { | |
2384 "nr", "ss", "s", "g", | |
2385 "f", "o", "c", "p", | |
2386 "k", "sk", "d", "l" | |
2387 }; | |
2388 | |
2389 /** Normal tax level numbers as constants */ | |
2390 public static final int NO_RANK=0, SUBSPECIES=1, SPECIES=2, GENUS=3, | |
2391 FAMILY=4, ORDER=5, CLASS=6, PHYLUM=7, KINGDOM=8, SUPERKINGDOM=9, DOMAIN=10, LIFE=11; | |
2392 | |
2393 /** TaxID of Life node */ | |
2394 public static final int LIFE_ID=1; | |
2395 /** TaxID of Cellular Organisms node */ | |
2396 public static final int CELLULAR_ORGANISMS_ID=131567; | |
2397 /** TaxID of Bacteria node */ | |
2398 public static final int BACTERIA_ID=2; //Is this safe? Who knows... | |
2399 /** TaxID of Archaea node */ | |
2400 public static final int ARCHAEA_ID=2157; | |
2401 /** TaxID of Euk node */ | |
2402 public static final int EUKARYOTA_ID=2759; | |
2403 /** TaxID of Animal node */ | |
2404 public static final int METAZOA_ID=33208, ANIMALIA_ID=33208; | |
2405 /** TaxID of Plant node */ | |
2406 public static final int VIRIDIPLANTAE_ID=33090, PLANTAE_ID=33090; | |
2407 /** TaxID of Fungi node */ | |
2408 public static final int FUNGI_ID=4751; | |
2409 /** TaxID of Virus node */ | |
2410 public static final int VIRUSES_ID=10239; | |
2411 /** TaxID of Viroids node (now defunct) */ | |
2412 public static final int VIROIDS_ID=12884; | |
2413 | |
2414 /** Maps normal level names to normal level numbers */ | |
2415 private static final HashMap<String, Integer> levelMap=makeLevelMap(); | |
2416 /** Maps extended level names to extended level numbers */ | |
2417 private static final HashMap<String, Integer> levelMapExtended=makeLevelMapExtended(); | |
2418 /** Maps extended level names to normal level numbers */ | |
2419 private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap(); | |
2420 | |
2421 /** Common extended level numbers as constants */ | |
2422 public static final int NO_RANK_E=NO_RANK, | |
2423 SUBSTRAIN_E=stringToLevelExtended("substrain"), STRAIN_E=stringToLevelExtended("strain"), | |
2424 SUBSPECIES_E=stringToLevelExtended("subspecies"), | |
2425 SPECIES_E=stringToLevelExtended("species"), GENUS_E=stringToLevelExtended("genus"), | |
2426 FAMILY_E=stringToLevelExtended("family"), ORDER_E=stringToLevelExtended("order"), | |
2427 CLASS_E=stringToLevelExtended("class"), PHYLUM_E=stringToLevelExtended("phylum"), | |
2428 KINGDOM_E=stringToLevelExtended("kingdom"), SUPERKINGDOM_E=stringToLevelExtended("superkingdom"), | |
2429 DOMAIN_E=stringToLevelExtended("domain"), LIFE_E=stringToLevelExtended("life"); | |
2430 | |
2431 /** Map of normal to extended level numbers */ | |
2432 private static final int[] levelToExtended=new int[] { | |
2433 NO_RANK_E, SUBSPECIES_E, SPECIES_E, GENUS_E, FAMILY_E, | |
2434 ORDER_E, CLASS_E, PHYLUM_E, KINGDOM_E, SUPERKINGDOM_E, DOMAIN_E, LIFE_E | |
2435 }; | |
2436 | |
2437 /** Map of extended to normal level numbers */ | |
2438 private static final int[] extendedToLevel=makeExtendedToLevel(); | |
2439 | |
2440 /** Creates extendedToLevel from taxaNamesExtendedMatrix during initialization. */ | |
2441 private static int[] makeExtendedToLevel(){ | |
2442 int len=0; | |
2443 for(String[] array : taxLevelNamesExtendedMatrix){ | |
2444 len+=array.length; | |
2445 } | |
2446 int[] ret=new int[len]; | |
2447 | |
2448 int pos=0; | |
2449 for(int level=0; level<taxLevelNamesExtendedMatrix.length; level++){ | |
2450 String[] array=taxLevelNamesExtendedMatrix[level]; | |
2451 for(int i=0; i<array.length; i++){ | |
2452 ret[pos]=level; | |
2453 pos++; | |
2454 } | |
2455 } | |
2456 return ret; | |
2457 } | |
2458 | |
2459 /** Convert a standard level number (like KINGDOM) to extended (like KINGDOM_E). */ | |
2460 public static final int levelToExtended(int level){ | |
2461 return level<0 ? level : levelToExtended[level]; | |
2462 } | |
2463 | |
2464 /** Convert an extended level number (like PHYLUM_E) to extended (like PHYLUM). | |
2465 * Non-standard levels will be converted to the next higher standard level; | |
2466 * e.g., subphylum -> phylum */ | |
2467 public static final int extendedToLevel(int extended){ | |
2468 return extended<0 ? -1 : extendedToLevel[extended]; | |
2469 } | |
2470 | |
2471 /* Pre-compiled delimiters to save time when splitting lines */ | |
2472 private static final Pattern delimiterTab = Pattern.compile("\t"); | |
2473 private static final Pattern delimiter = Pattern.compile("\t\\|\t"); | |
2474 private static final Pattern delimiterPipe = Pattern.compile("\\|"); | |
2475 private static final Pattern delimiterTilde = Pattern.compile("\\~"); | |
2476 private static final Pattern delimiter2 = Pattern.compile("[\\s_]+"); | |
2477 | |
2478 public static boolean IMG_HQ=false; | |
2479 | |
2480 /* For these fields, see the corresponding functions, below. | |
2481 * They define the default paths to various data on NERSC. */ | |
2482 | |
2483 private static final String defaultTaxPathNersc="/global/cfs/cdirs/bbtools/tax/latest"; | |
2484 private static final String defaultTaxPathAws="/test1/tax/latest"; | |
2485 private static final String defaultTaxPathIGBVM="/data/tax/latest"; | |
2486 private static final String default16SFileNersc="/global/cfs/cdirs/bbtools/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; | |
2487 private static final String default16SFileAws="/test1/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; | |
2488 private static final String default16SFileIGBVM="/data/sketch/silva/16S_consensus_with_silva_maxns10_taxsorted.fa.gz"; | |
2489 private static final String default18SFileNersc="/global/cfs/cdirs/bbtools/silva/18S_consensus_silva_maxns10_taxsorted.fa.gz"; | |
2490 private static final String default18SFileAws="/test1/18S_consensus_silva_maxns10_taxsorted.fa.gz"; | |
2491 private static final String default18SFileIGBVM="/data/sketch/silva/18S_consensus_with_silva_maxns10_taxsorted.fa.gz"; | |
2492 | |
2493 private static final String defaultImgFile="TAX_PATH/imgDump.txt"; | |
2494 private static final String defaultTableFileInt="TAX_PATH/gitable.int1d.gz"; | |
2495 private static final String defaultTableFile="TAX_PATH/gitable.int2d.gz"; | |
2496 private static final String defaultTreeFile="TAX_PATH/tree.taxtree.gz"; | |
2497 private static final String defaultPatternFile="TAX_PATH/patterns.txt"; | |
2498 private static final String defaultSizeFile="TAX_PATH/taxsize.tsv.gz"; | |
2499 | |
2500 private static final String defaultAccessionFile= | |
2501 //"TAX_PATH/shrunk.protF.accession2taxid.gz," + | |
2502 "TAX_PATH/shrunk.prot.accession2taxid.gz," | |
2503 + "TAX_PATH/shrunk.nucl_wgs.accession2taxid.gz," | |
2504 + "TAX_PATH/shrunk.nucl_gb.accession2taxid.gz," | |
2505 + "TAX_PATH/shrunk.dead_prot.accession2taxid.gz," | |
2506 // + "TAX_PATH/shrunk.nucl_est.accession2taxid.gz," | |
2507 + "TAX_PATH/shrunk.dead_wgs.accession2taxid.gz," | |
2508 // + "TAX_PATH/shrunk.nucl_gss.accession2taxid.gz," | |
2509 + "TAX_PATH/shrunk.dead_nucl.accession2taxid.gz," | |
2510 + "TAX_PATH/shrunk.pdb.accession2taxid.gz"; | |
2511 | |
2512 /** For setting TAX_PATH, the root to taxonomy files */ | |
2513 public static final String defaultTaxPath(){ | |
2514 return (Shared.AWS && !Shared.NERSC) ? defaultTaxPathAws : Shared.IGBVM ? defaultTaxPathIGBVM : defaultTaxPathNersc; | |
2515 } | |
2516 | |
2517 /** 16S consensus sequences per TaxID */ | |
2518 public static final String default16SFile(){ | |
2519 return (Shared.AWS && !Shared.NERSC) ? default16SFileAws : Shared.IGBVM ? default16SFileIGBVM : default16SFileNersc; | |
2520 } | |
2521 | |
2522 /** 18S consensus sequences per TaxID */ | |
2523 public static final String default18SFile(){ | |
2524 return (Shared.AWS && !Shared.NERSC) ? default18SFileAws : Shared.IGBVM ? default18SFileIGBVM : default18SFileNersc; | |
2525 } | |
2526 | |
2527 /** Path to all taxonomy files, substituted in to make specific file paths */ | |
2528 public static String TAX_PATH=defaultTaxPath(); | |
2529 | |
2530 /** Location of gitable.int2d.gz for gi lookups */ | |
2531 public static final String defaultTableFile(){return defaultTableFile.replaceAll("TAX_PATH", TAX_PATH);} | |
2532 /** Location of tree.taxtree.gz */ | |
2533 public static final String defaultTreeFile(){return defaultTreeFile.replaceAll("TAX_PATH", TAX_PATH);} | |
2534 | |
2535 //Use the prot.FULL.gz ncbi file. | |
2536 public static boolean protFull=false; | |
2537 | |
2538 /** Location of shrunk.*.accession2taxid.gz (all accession files, comma-delimited) */ | |
2539 public static final String defaultAccessionFile(){ | |
2540 String s=(protFull ? "TAX_PATH/shrunk.protF.accession2taxid.gz," : "")+defaultAccessionFile; | |
2541 return s.replaceAll("TAX_PATH", TAX_PATH); | |
2542 } | |
2543 /** Location of patterns.txt, which holds information about observed accession string formats */ | |
2544 public static final String defaultPatternFile(){return defaultPatternFile.replaceAll("TAX_PATH", TAX_PATH);} | |
2545 /** Location of imgDump.txt, which translates IMG to NCBI IDs for internal JGI use */ | |
2546 public static final String defaultImgFile(){return defaultImgFile.replaceAll("TAX_PATH", TAX_PATH);} | |
2547 /** Location of taxsize.tsv, which indicates the amount of sequence associated with a TaxID */ | |
2548 public static final String defaultSizeFile(){return defaultSizeFile.replaceAll("TAX_PATH", TAX_PATH);} | |
2549 | |
2550 /** Screen output gets printed here */ | |
2551 private static PrintStream outstream=System.out; | |
2552 | |
2553 } |