comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/GiToTaxidInt.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package tax;
2
3 import java.io.File;
4 import java.util.ArrayList;
5
6 import fileIO.ByteFile;
7 import fileIO.ReadWrite;
8 import shared.Parse;
9 import shared.Shared;
10 import shared.Tools;
11 import structures.IntList;
12
13 /**
14 * @author Brian Bushnell
15 * @date Mar 10, 2015
16 *
17 */
18 public class GiToTaxidInt {
19
20 public static void main(String[] args){
21 ReadWrite.USE_UNPIGZ=true;
22 ReadWrite.USE_PIGZ=true;
23 ReadWrite.ZIPLEVEL=9;
24 ReadWrite.PIGZ_BLOCKSIZE=256;
25 // ReadWrite.PIGZ_ITERATIONS=30;
26
27 for(String arg : args){
28 String[] split=arg.split("=");
29 String a=split[0].toLowerCase();
30 String b=split.length>1 ? split[1] : null;
31 shared.Parser.parseZip(arg, a, b);
32 }
33 // if(args.length>2 && false){//Run a test
34 // test(args);
35 // }else
36 if(args.length>=2){//Write array
37 initialize(args[0]);
38 ReadWrite.write(array, args[1], true);
39 }
40 }
41
42 public static void test(String[] args){
43 System.err.println(getID(1000));
44 System.err.println(getID(10000));
45 System.err.println(getID(10001));
46 System.err.println(getID(10002));
47 System.err.println(getID(10003));
48 System.err.println(getID(10004));
49 System.err.println(getID(10005));
50 System.err.println(getID(100000));
51 System.err.println(getID(1000000));
52 System.err.println(getID(10000000));
53
54 TaxTree tree=null;
55 if(args.length>1){
56 tree=TaxTree.loadTaxTree(args[0], System.err, true, true);
57 }
58
59 System.err.println("Strings:");
60 int x;
61 x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44");
62 System.err.println(x);
63 if(tree!=null){
64 System.err.println(tree.getNode(x));
65 tree.incrementRaw(x, 30);
66 }
67 x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds");
68 System.err.println(x);
69 if(tree!=null){
70 System.err.println(tree.getNode(x));
71 tree.incrementRaw(x, 40);
72 }
73 x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence");
74 System.err.println(x);
75 if(tree!=null){
76 System.err.println(tree.getNode(x));
77 tree.incrementRaw(x, 20);
78 }
79
80 if(tree!=null){
81 tree.percolateUp();
82 ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35);
83 for(TaxNode n : nodes){
84 System.err.println(n);
85 }
86 }
87 }
88
89 public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
90 public static int parseGiToTaxid(String s, char delimiter){
91 int x=parseGiNumber(s, delimiter);
92 assert(x>=0) : s;
93 assert(array!=null) : "To use gi numbers, you must load a gi table.";
94 // if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
95 if(x>=0 && x<array.length){return array[x];}
96 assert(x<array.length) : "The GI number "+x+" is too big.\n"
97 + "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
98 + "To ignore this problem, please run with the -da flag.\n";
99 return -1;
100 }
101
102
103 public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
104 public static int parseGiToTaxid(byte[] s, char delimiter){
105 long x=parseGiNumber(s, delimiter);
106 if(x>=0 && x<array.length){return array[(int)x];}
107 if(x<0){return -1;}
108 assert(false) : x;
109 return -1;
110 }
111
112 /** Parse a gi number, or return -1 if formatted incorrectly. */
113 static int parseGiNumber(String s, char delimiter){
114 if(s==null || s.length()<4){return -1;}
115 // System.err.println("a");
116 if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
117 // System.err.println("b");
118 if(!s.startsWith("gi")){return -1;}
119 // System.err.println("c");
120 // System.err.println("d");
121 int initial=s.indexOf(delimiter);
122 // System.err.println("e");
123 if(initial<0){
124 if(delimiter!='~'){
125 delimiter='~';
126 initial=s.indexOf(delimiter);
127 }
128 if(initial<0){
129 delimiter='_';
130 initial=s.indexOf(delimiter);
131 }
132 if(initial<0){return -1;}
133 // System.err.println("f");
134 // System.err.println("g");
135 }
136 // System.err.println("h");
137 if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
138 // System.err.println("i");
139
140 int number=0;
141 for(int i=initial+1; i<s.length(); i++){
142 char c=s.charAt(i);
143 if(c==delimiter){break;}
144 assert(Tools.isDigit(c));
145 number=(number*10)+(c-'0');
146 }
147 // System.err.println("j: "+number);
148 return number;
149 }
150
151 /** Parse a ncbi number, or return -1 if formatted incorrectly. */
152 public static int parseTaxidNumber(String s, char delimiter){
153 if(s==null || s.length()<5){return -1;}
154 if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);}
155 if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;}
156 int initial=s.indexOf(delimiter);
157 if(initial<0){
158 delimiter='_';
159 initial=s.indexOf(delimiter);
160 if(initial<0){return -1;}
161 }
162 if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
163
164 int number=0;
165 for(int i=initial+1; i<s.length(); i++){
166 char c=s.charAt(i);
167 if(c==delimiter || c==' '){break;}
168 assert(Tools.isDigit(c)) : c+"\n"+s;
169 number=(number*10)+(c-'0');
170 }
171 return number;
172 }
173
174
175 public static int getID(String s){return getID(s, '|');}
176 /** Get the taxID from a header starting with a taxID or gi number */
177 public static int getID(String s, char delimiter){
178 int x=parseTaxidNumber(s, delimiter);
179 if(x>=0){return x;}
180 x=parseGiNumber(s, delimiter);
181 if(x>=0){return array[x];}
182 return -1;
183 }
184
185 /** Parse a gi number, or return -1 if formatted incorrectly. */
186 static int parseGiNumber(byte[] s, char delimiter){
187 if(s==null || s.length<4){return -1;}
188 if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
189 int initial=Tools.indexOf(s, (byte)delimiter);
190 if(initial<0){
191 delimiter='_';
192 initial=Tools.indexOf(s, (byte)delimiter);
193 if(initial<0){return -1;}
194 }
195 if(!Tools.isDigit(s[initial+1])){return -1;}
196
197 long number=0;
198 for(int i=initial+1; i<s.length; i++){
199 byte c=s[i];
200 if(c==delimiter){break;}
201 assert(Tools.isDigit(c));
202 number=(number*10)+(c-'0');
203 }
204 return (int)number;
205 }
206
207 /** Parse a gi number, or return -1 if formatted incorrectly. */
208 static int parseNcbiNumber(byte[] s, char delimiter){
209 if(s==null || s.length<3){return -1;}
210 if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;}
211 int initial=Tools.indexOf(s, (byte)delimiter);
212 if(initial<0){
213 delimiter='_';
214 initial=Tools.indexOf(s, (byte)delimiter);
215 if(initial<0){return -1;}
216 }
217 if(!Tools.isDigit(s[initial+1])){return -1;}
218
219 int number=0;
220 for(int i=initial+1; i<s.length; i++){
221 byte c=s[i];
222 if(c==delimiter){break;}
223 assert(Tools.isDigit(c));
224 number=(number*10)+(c-'0');
225 }
226 return number;
227 }
228
229 public static int getID(byte[] s){return getID(s, '|');}
230 /** Get the taxID from a header starting with a taxID or gi number */
231 public static int getID(byte[] s, char delimiter){
232 int x=parseGiNumber(s, delimiter);
233 if(x>=0){return array[x];}
234 return parseNcbiNumber(s, delimiter);
235 }
236
237 /** Get the taxID from a gi number */
238 public static int getID(long gi){
239 assert(gi>=0) : gi;
240 assert(gi<Integer.MAX_VALUE) : gi+" > "+Integer.MAX_VALUE;
241 assert(gi<array.length) : gi+", "+array.length;
242 return array[(int)gi];
243 }
244
245 public static void initialize(String fname){
246 assert(fname!=null);
247 if(fileString==null || !fileString.equals(fname)){
248 synchronized(GiToTaxid.class){
249 if(!initialized || fileString==null || !fileString.equals(fname)){
250 fileString=fname;
251 if(fname.contains(".int1d")){
252 array=ReadWrite.read(int[].class, fname, true);
253 }else{
254 array=makeArray(fname);
255 }
256 }
257 initialized=true;
258 }
259 }
260 }
261
262 public static boolean isInitialized(){return initialized;}
263
264 public static synchronized void unload(){
265 array=null;
266 fileString=null;
267 initialized=false;
268 }
269
270 private static int[] makeArray(String fnames){
271 String[] split;
272 if(new File(fnames).exists()){split=new String[] {fnames};}
273 else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
274 else if(fnames.indexOf('#')>=0){
275 assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
276 File dir=new File(System.getProperty("user.dir"));
277 String prefix=fnames.substring(0, fnames.indexOf('#'));
278 String suffix=fnames.substring(fnames.indexOf('#')+1);
279
280 File[] array=dir.listFiles();
281 StringBuilder sb=new StringBuilder();
282 String comma="";
283 for(File f : array){
284 String s=f.getName();
285 if(s.startsWith(prefix) && s.startsWith(suffix)){
286 sb.append(comma);
287 sb.append(s);
288 comma=",";
289 }
290 }
291 split=sb.toString().split(",");
292 }else{
293 throw new RuntimeException("Invalid file: "+fnames);
294 }
295
296 IntList list=new IntList();
297 // assert(max<Integer.MAX_VALUE) : "Overflow.";
298 // int[] x=new int[(int)max+1];
299 // Arrays.fill(x, -1);
300
301 long total=0;
302 for(String s : split){
303 long count=addToList(s, list);
304 total+=count;
305 }
306 return list.shrink().array;
307 }
308
309 private static long addToList(String fname, IntList list){
310 boolean warned=false;
311 ByteFile bf=ByteFile.makeByteFile(fname, true);
312 long count=0, invalid=0;
313 byte[] line=bf.nextLine();
314 while(line!=null){
315 if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na
316 count++;
317 int tab2=Tools.indexOfNth(line, '\t', 2);
318 int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1);
319 assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
320 assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
321 assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
322 // assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
323 int tid=Parse.parseInt(line, tab2+1, tab3);
324 int gi=Parse.parseInt(line, tab3+1, line.length);
325 if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
326 invalid++;
327 }else{
328 assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
329 int old=list.get(gi);
330 assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
331
332 list.set(gi, tid);
333
334 //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
335 // if(x[gi]!=-1 && x[gi]!=ncbi){
336 // if(!warned){
337 // System.err.println("***WARNING*** For file "+fname+":\n"+
338 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
339 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
340 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
341 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
342 // warned=true;
343 // }
344 // }else{
345 // x[gi]=ncbi;
346 // }
347 }
348 }else{
349 if(line.length==0){System.err.println(fname+", "+count);}//debug
350 invalid++;
351 }
352 line=bf.nextLine();
353 }
354 if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);}
355 bf.close();
356 return count;
357 }
358
359 // private static int[] makeArrayOld(String fnames){
360 // String[] split;
361 // if(new File(fnames).exists()){split=new String[] {fnames};}
362 // else{split=fnames.split(",");}
363 //
364 // long max=0;
365 // for(String s : split){
366 // max=Tools.max(max, findMaxID(s));
367 // }
368 //
369 // assert(max<Integer.MAX_VALUE) : "Overflow.";
370 // int[] x=new int[(int)max+1];
371 // Arrays.fill(x, -1);
372 //
373 // long total=0;
374 // for(String s : split){
375 // long count=fillArray(s, x);
376 // total+=count;
377 // }
378 // return x;
379 // }
380 //
381 // private static long findMaxID(String fname){
382 // ByteFile bf=ByteFile.makeByteFile(fname, true);
383 // long count=0, max=0;
384 // byte[] line=bf.nextLine();
385 // while(line!=null){
386 // count++;
387 // int tab=Tools.indexOf(line, (byte)'\t');
388 // long gi=Parse.parseLong(line, 0, tab);
389 // max=Tools.max(max, gi);
390 // line=bf.nextLine();
391 // }
392 // bf.close();
393 // return max;
394 // }
395 //
396 // private static long fillArray(String fname, int[] x){
397 // boolean warned=false;
398 // ByteFile bf=ByteFile.makeByteFile(fname, true);
399 // long count=0;
400 // byte[] line=bf.nextLine();
401 // while(line!=null){
402 // count++;
403 // int tab=Tools.indexOf(line, (byte)'\t');
404 // int gi=Parse.parseInt(line, 0, tab);
405 // int ncbi=Parse.parseInt(line, tab+1, line.length);
406 // //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
407 // if(x[gi]!=-1 && x[gi]!=ncbi){
408 // if(!warned){
409 // System.err.println("***WARNING*** For file "+fname+":\n"+
410 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
411 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
412 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
413 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
414 // warned=true;
415 // }
416 // }else{
417 // x[gi]=ncbi;
418 // }
419 // line=bf.nextLine();
420 // }
421 // if(verbose){System.err.println("Count: "+count);}
422 // bf.close();
423 // return count;
424 // }
425
426 private static int[] array;
427 private static String fileString;
428
429 public static boolean verbose=false;
430 private static boolean initialized=false;
431 }