Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/GiToTaxid.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package tax; | |
2 | |
3 import java.io.File; | |
4 import java.util.ArrayList; | |
5 | |
6 import fileIO.ByteFile; | |
7 import fileIO.ReadWrite; | |
8 import shared.Parse; | |
9 import shared.Shared; | |
10 import shared.Tools; | |
11 import structures.IntList; | |
12 | |
13 /** | |
14 * @author Brian Bushnell | |
15 * @date Mar 10, 2015 | |
16 * | |
17 */ | |
18 public class GiToTaxid { | |
19 | |
20 public static void main(String[] args){ | |
21 ReadWrite.USE_UNPIGZ=true; | |
22 ReadWrite.USE_PIGZ=true; | |
23 ReadWrite.ZIPLEVEL=9; | |
24 ReadWrite.PIGZ_BLOCKSIZE=256; | |
25 // ReadWrite.PIGZ_ITERATIONS=30; | |
26 | |
27 for(String arg : args){ | |
28 String[] split=arg.split("="); | |
29 String a=split[0].toLowerCase(); | |
30 String b=split.length>1 ? split[1] : null; | |
31 shared.Parser.parseZip(arg, a, b); | |
32 } | |
33 // if(args.length>2 && false){//Run a test | |
34 // test(args); | |
35 // }else | |
36 if(args.length>=2){//Write array | |
37 initialize(args[0]); | |
38 ReadWrite.write(array, args[1], true); | |
39 } | |
40 } | |
41 | |
42 public static void test(String[] args){ | |
43 System.err.println(getID(1000)); | |
44 System.err.println(getID(10000)); | |
45 System.err.println(getID(10001)); | |
46 System.err.println(getID(10002)); | |
47 System.err.println(getID(10003)); | |
48 System.err.println(getID(10004)); | |
49 System.err.println(getID(10005)); | |
50 System.err.println(getID(100000)); | |
51 System.err.println(getID(1000000)); | |
52 System.err.println(getID(10000000)); | |
53 | |
54 TaxTree tree=null; | |
55 if(args.length>1){ | |
56 tree=TaxTree.loadTaxTree(args[0], System.err, true, true); | |
57 } | |
58 | |
59 System.err.println("Strings:"); | |
60 int x; | |
61 x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44"); | |
62 System.err.println(x); | |
63 if(tree!=null){ | |
64 System.err.println(tree.getNode(x)); | |
65 tree.incrementRaw(x, 30); | |
66 } | |
67 x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds"); | |
68 System.err.println(x); | |
69 if(tree!=null){ | |
70 System.err.println(tree.getNode(x)); | |
71 tree.incrementRaw(x, 40); | |
72 } | |
73 x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence"); | |
74 System.err.println(x); | |
75 if(tree!=null){ | |
76 System.err.println(tree.getNode(x)); | |
77 tree.incrementRaw(x, 20); | |
78 } | |
79 | |
80 if(tree!=null){ | |
81 tree.percolateUp(); | |
82 ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35); | |
83 for(TaxNode n : nodes){ | |
84 System.err.println(n); | |
85 } | |
86 } | |
87 } | |
88 | |
89 public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');} | |
90 public static int parseGiToTaxid(String s, char delimiter){ | |
91 long x=parseGiNumber(s, delimiter); | |
92 assert(x>=0) : x+", "+s; | |
93 return getID(x); | |
94 } | |
95 | |
96 | |
97 public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');} | |
98 public static int parseGiToTaxid(byte[] s, char delimiter){ | |
99 long x=parseGiNumber(s, delimiter); | |
100 return x<0 ? -1 : getID(x); | |
101 } | |
102 | |
103 /** Parse a gi number, or return -1 if formatted incorrectly. */ | |
104 static long parseGiNumber(String s, char delimiter){ | |
105 if(s==null || s.length()<4){return -1;} | |
106 if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);} | |
107 if(!s.startsWith("gi")){return -1;} | |
108 int initial=s.indexOf(delimiter); | |
109 if(initial<0){ | |
110 if(delimiter!='~'){ | |
111 delimiter='~'; | |
112 initial=s.indexOf(delimiter); | |
113 } | |
114 if(initial<0){ | |
115 delimiter='_'; | |
116 initial=s.indexOf(delimiter); | |
117 } | |
118 if(initial<0){return -1;} | |
119 } | |
120 if(!Tools.isDigit(s.charAt(initial+1))){return -1;} | |
121 | |
122 long number=0; | |
123 for(int i=initial+1; i<s.length(); i++){ | |
124 char c=s.charAt(i); | |
125 if(c==delimiter){break;} | |
126 assert(Tools.isDigit(c)); | |
127 number=(number*10)+(c-'0'); | |
128 } | |
129 return number; | |
130 } | |
131 | |
132 /** Parse a ncbi number, or return -1 if formatted incorrectly. */ | |
133 public static int parseTaxidNumber(String s, char delimiter){ | |
134 if(s==null || s.length()<5){return -1;} | |
135 if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);} | |
136 if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;} | |
137 int initial=s.indexOf(delimiter); | |
138 if(initial<0){ | |
139 delimiter='_'; | |
140 initial=s.indexOf(delimiter); | |
141 if(initial<0){return -1;} | |
142 } | |
143 if(!Tools.isDigit(s.charAt(initial+1))){return -1;} | |
144 | |
145 int number=0; | |
146 for(int i=initial+1; i<s.length(); i++){ | |
147 char c=s.charAt(i); | |
148 if(c==delimiter || c==' '){break;} | |
149 assert(Tools.isDigit(c)) : c+"\n"+s; | |
150 number=(number*10)+(c-'0'); | |
151 } | |
152 return number; | |
153 } | |
154 | |
155 | |
156 public static int getID(String s){return getID(s, '|');} | |
157 /** Get the taxID from a header starting with a taxID or gi number */ | |
158 public static int getID(String s, char delimiter){ | |
159 long x=parseTaxidNumber(s, delimiter); | |
160 if(x>=0){return (int)x;} | |
161 x=parseGiNumber(s, delimiter); | |
162 return x<0 ? -1 : getID(x); | |
163 } | |
164 | |
165 /** Parse a gi number, or return -1 if formatted incorrectly. */ | |
166 static long parseGiNumber(byte[] s, char delimiter){ | |
167 if(s==null || s.length<4){return -1;} | |
168 if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;} | |
169 int initial=Tools.indexOf(s, (byte)delimiter); | |
170 if(initial<0){ | |
171 delimiter='_'; | |
172 initial=Tools.indexOf(s, (byte)delimiter); | |
173 if(initial<0){return -1;} | |
174 } | |
175 if(!Tools.isDigit(s[initial+1])){return -1;} | |
176 | |
177 long number=0; | |
178 for(int i=initial+1; i<s.length; i++){ | |
179 byte c=s[i]; | |
180 if(c==delimiter){break;} | |
181 assert(Tools.isDigit(c)); | |
182 number=(number*10)+(c-'0'); | |
183 } | |
184 return number; | |
185 } | |
186 | |
187 /** Parse a gi number, or return -1 if formatted incorrectly. */ | |
188 static int parseNcbiNumber(byte[] s, char delimiter){ | |
189 if(s==null || s.length<3){return -1;} | |
190 if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;} | |
191 int initial=Tools.indexOf(s, (byte)delimiter); | |
192 if(initial<0){ | |
193 delimiter='_'; | |
194 initial=Tools.indexOf(s, (byte)delimiter); | |
195 if(initial<0){return -1;} | |
196 } | |
197 if(!Tools.isDigit(s[initial+1])){return -1;} | |
198 | |
199 int number=0; | |
200 for(int i=initial+1; i<s.length; i++){ | |
201 byte c=s[i]; | |
202 if(c==delimiter){break;} | |
203 assert(Tools.isDigit(c)); | |
204 number=(number*10)+(c-'0'); | |
205 } | |
206 return number; | |
207 } | |
208 | |
209 public static int getID(byte[] s){return getID(s, '|');} | |
210 /** Get the taxID from a header starting with a taxID or gi number */ | |
211 public static int getID(byte[] s, char delimiter){ | |
212 long x=parseGiNumber(s, delimiter); | |
213 if(x>=0){return getID(x, true);} | |
214 return parseNcbiNumber(s, delimiter); | |
215 } | |
216 | |
217 /** Get the taxID from a gi number; | |
218 * -1 if not present or invalid (negative input), | |
219 * -2 if out of range (too high) */ | |
220 public static int getID(long gi){ | |
221 return getID(gi, true); | |
222 } | |
223 | |
224 /** Get the taxID from a gi number; | |
225 * 0 if not present, | |
226 * -1 if invalid (negative input), | |
227 * -2 if out of range (too high) */ | |
228 public static int getID(long gi, boolean assertInRange){ | |
229 assert(initialized) : "To use gi numbers, you must load a gi table."; | |
230 if(gi<0 || gi>maxGiLoaded){ | |
231 assert(!assertInRange) : gi<0 ? "gi number "+gi+" is invalid." : | |
232 "The gi number "+gi+" is too big: Max loaded gi number is "+maxGiLoaded+".\n" | |
233 + "Please update the gi table with the latest version from NCBI" | |
234 + " as per the instructions in gitable.sh.\n" | |
235 + "To ignore this problem, please run with the -da flag.\n"; | |
236 return gi<0 ? -1 : -2; | |
237 } | |
238 final long upper=gi>>>SHIFT; | |
239 final int lower=(int)(gi&LOWERMASK); | |
240 assert(upper<Shared.MAX_ARRAY_LEN && upper<array.length) : gi+", "+upper+", "+array.length; | |
241 final int[] slice=array[(int)upper]; | |
242 return slice==null || slice.length<=lower ? 0 : slice[lower]; | |
243 } | |
244 | |
245 public static void initialize(String fname){ | |
246 assert(fname!=null); | |
247 if(fileString==null || !fileString.equals(fname)){ | |
248 synchronized(GiToTaxid.class){ | |
249 if(!initialized || fileString==null || !fileString.equals(fname)){ | |
250 fileString=fname; | |
251 if(fname.contains(".int2d")){ | |
252 array=ReadWrite.read(int[][].class, fname, true); | |
253 maxGiLoaded=-1; | |
254 if(array!=null && array.length>0){ | |
255 int upper=array.length-1; | |
256 int[] section=array[upper]; | |
257 int lower=section.length-1; | |
258 maxGiLoaded=(((long)upper)<<SHIFT)|lower; | |
259 } | |
260 }else if(fname.contains(".int1d")){ | |
261 throw new RuntimeException("Old gi table format filename "+fname+".\n" | |
262 + "Current files should end in .int2d."); | |
263 | |
264 }else{ | |
265 array=makeArray(fname); | |
266 } | |
267 } | |
268 initialized=true; | |
269 } | |
270 } | |
271 } | |
272 | |
273 public static boolean isInitialized(){return initialized;} | |
274 | |
275 public static synchronized void unload(){ | |
276 maxGiLoaded=-1; | |
277 array=null; | |
278 fileString=null; | |
279 initialized=false; | |
280 } | |
281 | |
282 private static int[][] makeArray(String fnames){ | |
283 String[] split; | |
284 if(new File(fnames).exists()){split=new String[] {fnames};} | |
285 else if(fnames.indexOf(',')>=0){split=fnames.split(",");} | |
286 else if(fnames.indexOf('#')>=0){ | |
287 assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for " | |
288 + "relative paths in present working directory."; | |
289 File dir=new File(System.getProperty("user.dir")); | |
290 String prefix=fnames.substring(0, fnames.indexOf('#')); | |
291 String suffix=fnames.substring(fnames.indexOf('#')+1); | |
292 | |
293 File[] array=dir.listFiles(); | |
294 StringBuilder sb=new StringBuilder(); | |
295 String comma=""; | |
296 for(File f : array){ | |
297 String s=f.getName(); | |
298 if(s.startsWith(prefix) && s.startsWith(suffix)){ | |
299 sb.append(comma); | |
300 sb.append(s); | |
301 comma=","; | |
302 } | |
303 } | |
304 split=sb.toString().split(","); | |
305 }else{ | |
306 throw new RuntimeException("Invalid file: "+fnames); | |
307 } | |
308 | |
309 int numLists=32; | |
310 IntList[] lists=new IntList[numLists]; | |
311 | |
312 long total=0; | |
313 for(String s : split){ | |
314 long count=addToList(s, lists); | |
315 total+=count; | |
316 } | |
317 for(int i=0; i<lists.length; i++){ | |
318 if(lists[i]!=null && lists[i].size>0){ | |
319 lists[i].shrink(); | |
320 numLists=i+1; | |
321 } | |
322 } | |
323 int[][] table=new int[numLists][]; | |
324 for(int i=0; i<numLists; i++){ | |
325 table[i]=lists[i].array; | |
326 } | |
327 return table; | |
328 } | |
329 | |
330 private static long addToList(String fname, IntList[] lists){ | |
331 boolean warned=false; | |
332 ByteFile bf=ByteFile.makeByteFile(fname, true); | |
333 long count=0, invalid=0; | |
334 byte[] line=bf.nextLine(); | |
335 while(line!=null){ | |
336 if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na | |
337 count++; | |
338 int tab2=Tools.indexOfNth(line, '\t', 2); | |
339 int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1); | |
340 assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length; | |
341 assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'"; | |
342 assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'"; | |
343 //assert(false) : tab2+", "+tab3+", '"+new String(line)+"'"; | |
344 int tid=Parse.parseInt(line, tab2+1, tab3); | |
345 int gi=Parse.parseInt(line, tab3+1, line.length); | |
346 if(gi<0){ | |
347 invalid++; | |
348 }else{ | |
349 assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'"; | |
350 int old=setID(gi, tid, lists); | |
351 assert(old<1 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3; | |
352 } | |
353 }else{ | |
354 //if(line.length==0){System.err.println(fname+", "+count);}//debug | |
355 invalid++; | |
356 } | |
357 line=bf.nextLine(); | |
358 } | |
359 if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);} | |
360 bf.close(); | |
361 return count; | |
362 } | |
363 | |
364 private static int getID(long gi, IntList[] lists){ | |
365 assert(gi>=0) : "gi number "+gi+" is invalid."; | |
366 final long upper=gi>>>SHIFT; | |
367 final int lower=(int)(gi&LOWERMASK); | |
368 assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper; | |
369 IntList list=lists[(int)upper]; | |
370 return lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower); | |
371 } | |
372 | |
373 private static int setID(long gi, int tid, IntList[] lists){ | |
374 assert(gi>=0) : "gi number "+gi+" is invalid."; | |
375 final long upper=gi>>>SHIFT; | |
376 final int lower=(int)(gi&LOWERMASK); | |
377 assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper; | |
378 IntList list=lists[(int)upper]; | |
379 if(list==null){list=lists[(int)upper]=new IntList();} | |
380 int old=lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower); | |
381 list.set(lower, tid); | |
382 maxGiLoaded=Tools.max(gi, maxGiLoaded); | |
383 return old; | |
384 } | |
385 | |
386 // private static int[] makeArrayOld(String fnames){ | |
387 // String[] split; | |
388 // if(new File(fnames).exists()){split=new String[] {fnames};} | |
389 // else{split=fnames.split(",");} | |
390 // | |
391 // long max=0; | |
392 // for(String s : split){ | |
393 // max=Tools.max(max, findMaxID(s)); | |
394 // } | |
395 // | |
396 // assert(max<Integer.MAX_VALUE) : "Overflow."; | |
397 // int[] x=new int[(int)max+1]; | |
398 // Arrays.fill(x, -1); | |
399 // | |
400 // long total=0; | |
401 // for(String s : split){ | |
402 // long count=fillArray(s, x); | |
403 // total+=count; | |
404 // } | |
405 // return x; | |
406 // } | |
407 // | |
408 // private static long findMaxID(String fname){ | |
409 // ByteFile bf=ByteFile.makeByteFile(fname, true); | |
410 // long count=0, max=0; | |
411 // byte[] line=bf.nextLine(); | |
412 // while(line!=null){ | |
413 // count++; | |
414 // int tab=Tools.indexOf(line, (byte)'\t'); | |
415 // long gi=Parse.parseLong(line, 0, tab); | |
416 // max=Tools.max(max, gi); | |
417 // line=bf.nextLine(); | |
418 // } | |
419 // bf.close(); | |
420 // return max; | |
421 // } | |
422 // | |
423 // private static long fillArray(String fname, int[] x){ | |
424 // boolean warned=false; | |
425 // ByteFile bf=ByteFile.makeByteFile(fname, true); | |
426 // long count=0; | |
427 // byte[] line=bf.nextLine(); | |
428 // while(line!=null){ | |
429 // count++; | |
430 // int tab=Tools.indexOf(line, (byte)'\t'); | |
431 // int gi=Parse.parseInt(line, 0, tab); | |
432 // int ncbi=Parse.parseInt(line, tab+1, line.length); | |
433 // //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi; | |
434 // if(x[gi]!=-1 && x[gi]!=ncbi){ | |
435 // if(!warned){ | |
436 // System.err.println("***WARNING*** For file "+fname+":\n"+ | |
437 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ | |
438 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" | |
439 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" | |
440 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); | |
441 // warned=true; | |
442 // } | |
443 // }else{ | |
444 // x[gi]=ncbi; | |
445 // } | |
446 // line=bf.nextLine(); | |
447 // } | |
448 // if(verbose){System.err.println("Count: "+count);} | |
449 // bf.close(); | |
450 // return count; | |
451 // } | |
452 | |
453 private static long maxGiLoaded=-1; | |
454 private static int[][] array; | |
455 private static final int SHIFT=30; | |
456 private static final long UPPERMASK=(-1L)<<SHIFT; | |
457 private static final long LOWERMASK=~UPPERMASK; | |
458 | |
459 private static String fileString; | |
460 | |
461 public static boolean verbose=false; | |
462 private static boolean initialized=false; | |
463 } |