annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/GiToTaxidInt.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package tax;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4 import java.util.ArrayList;
jpayne@68 5
jpayne@68 6 import fileIO.ByteFile;
jpayne@68 7 import fileIO.ReadWrite;
jpayne@68 8 import shared.Parse;
jpayne@68 9 import shared.Shared;
jpayne@68 10 import shared.Tools;
jpayne@68 11 import structures.IntList;
jpayne@68 12
jpayne@68 13 /**
jpayne@68 14 * @author Brian Bushnell
jpayne@68 15 * @date Mar 10, 2015
jpayne@68 16 *
jpayne@68 17 */
jpayne@68 18 public class GiToTaxidInt {
jpayne@68 19
jpayne@68 20 public static void main(String[] args){
jpayne@68 21 ReadWrite.USE_UNPIGZ=true;
jpayne@68 22 ReadWrite.USE_PIGZ=true;
jpayne@68 23 ReadWrite.ZIPLEVEL=9;
jpayne@68 24 ReadWrite.PIGZ_BLOCKSIZE=256;
jpayne@68 25 // ReadWrite.PIGZ_ITERATIONS=30;
jpayne@68 26
jpayne@68 27 for(String arg : args){
jpayne@68 28 String[] split=arg.split("=");
jpayne@68 29 String a=split[0].toLowerCase();
jpayne@68 30 String b=split.length>1 ? split[1] : null;
jpayne@68 31 shared.Parser.parseZip(arg, a, b);
jpayne@68 32 }
jpayne@68 33 // if(args.length>2 && false){//Run a test
jpayne@68 34 // test(args);
jpayne@68 35 // }else
jpayne@68 36 if(args.length>=2){//Write array
jpayne@68 37 initialize(args[0]);
jpayne@68 38 ReadWrite.write(array, args[1], true);
jpayne@68 39 }
jpayne@68 40 }
jpayne@68 41
jpayne@68 42 public static void test(String[] args){
jpayne@68 43 System.err.println(getID(1000));
jpayne@68 44 System.err.println(getID(10000));
jpayne@68 45 System.err.println(getID(10001));
jpayne@68 46 System.err.println(getID(10002));
jpayne@68 47 System.err.println(getID(10003));
jpayne@68 48 System.err.println(getID(10004));
jpayne@68 49 System.err.println(getID(10005));
jpayne@68 50 System.err.println(getID(100000));
jpayne@68 51 System.err.println(getID(1000000));
jpayne@68 52 System.err.println(getID(10000000));
jpayne@68 53
jpayne@68 54 TaxTree tree=null;
jpayne@68 55 if(args.length>1){
jpayne@68 56 tree=TaxTree.loadTaxTree(args[0], System.err, true, true);
jpayne@68 57 }
jpayne@68 58
jpayne@68 59 System.err.println("Strings:");
jpayne@68 60 int x;
jpayne@68 61 x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44");
jpayne@68 62 System.err.println(x);
jpayne@68 63 if(tree!=null){
jpayne@68 64 System.err.println(tree.getNode(x));
jpayne@68 65 tree.incrementRaw(x, 30);
jpayne@68 66 }
jpayne@68 67 x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds");
jpayne@68 68 System.err.println(x);
jpayne@68 69 if(tree!=null){
jpayne@68 70 System.err.println(tree.getNode(x));
jpayne@68 71 tree.incrementRaw(x, 40);
jpayne@68 72 }
jpayne@68 73 x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence");
jpayne@68 74 System.err.println(x);
jpayne@68 75 if(tree!=null){
jpayne@68 76 System.err.println(tree.getNode(x));
jpayne@68 77 tree.incrementRaw(x, 20);
jpayne@68 78 }
jpayne@68 79
jpayne@68 80 if(tree!=null){
jpayne@68 81 tree.percolateUp();
jpayne@68 82 ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35);
jpayne@68 83 for(TaxNode n : nodes){
jpayne@68 84 System.err.println(n);
jpayne@68 85 }
jpayne@68 86 }
jpayne@68 87 }
jpayne@68 88
jpayne@68 89 public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
jpayne@68 90 public static int parseGiToTaxid(String s, char delimiter){
jpayne@68 91 int x=parseGiNumber(s, delimiter);
jpayne@68 92 assert(x>=0) : s;
jpayne@68 93 assert(array!=null) : "To use gi numbers, you must load a gi table.";
jpayne@68 94 // if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
jpayne@68 95 if(x>=0 && x<array.length){return array[x];}
jpayne@68 96 assert(x<array.length) : "The GI number "+x+" is too big.\n"
jpayne@68 97 + "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
jpayne@68 98 + "To ignore this problem, please run with the -da flag.\n";
jpayne@68 99 return -1;
jpayne@68 100 }
jpayne@68 101
jpayne@68 102
jpayne@68 103 public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
jpayne@68 104 public static int parseGiToTaxid(byte[] s, char delimiter){
jpayne@68 105 long x=parseGiNumber(s, delimiter);
jpayne@68 106 if(x>=0 && x<array.length){return array[(int)x];}
jpayne@68 107 if(x<0){return -1;}
jpayne@68 108 assert(false) : x;
jpayne@68 109 return -1;
jpayne@68 110 }
jpayne@68 111
jpayne@68 112 /** Parse a gi number, or return -1 if formatted incorrectly. */
jpayne@68 113 static int parseGiNumber(String s, char delimiter){
jpayne@68 114 if(s==null || s.length()<4){return -1;}
jpayne@68 115 // System.err.println("a");
jpayne@68 116 if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
jpayne@68 117 // System.err.println("b");
jpayne@68 118 if(!s.startsWith("gi")){return -1;}
jpayne@68 119 // System.err.println("c");
jpayne@68 120 // System.err.println("d");
jpayne@68 121 int initial=s.indexOf(delimiter);
jpayne@68 122 // System.err.println("e");
jpayne@68 123 if(initial<0){
jpayne@68 124 if(delimiter!='~'){
jpayne@68 125 delimiter='~';
jpayne@68 126 initial=s.indexOf(delimiter);
jpayne@68 127 }
jpayne@68 128 if(initial<0){
jpayne@68 129 delimiter='_';
jpayne@68 130 initial=s.indexOf(delimiter);
jpayne@68 131 }
jpayne@68 132 if(initial<0){return -1;}
jpayne@68 133 // System.err.println("f");
jpayne@68 134 // System.err.println("g");
jpayne@68 135 }
jpayne@68 136 // System.err.println("h");
jpayne@68 137 if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
jpayne@68 138 // System.err.println("i");
jpayne@68 139
jpayne@68 140 int number=0;
jpayne@68 141 for(int i=initial+1; i<s.length(); i++){
jpayne@68 142 char c=s.charAt(i);
jpayne@68 143 if(c==delimiter){break;}
jpayne@68 144 assert(Tools.isDigit(c));
jpayne@68 145 number=(number*10)+(c-'0');
jpayne@68 146 }
jpayne@68 147 // System.err.println("j: "+number);
jpayne@68 148 return number;
jpayne@68 149 }
jpayne@68 150
jpayne@68 151 /** Parse a ncbi number, or return -1 if formatted incorrectly. */
jpayne@68 152 public static int parseTaxidNumber(String s, char delimiter){
jpayne@68 153 if(s==null || s.length()<5){return -1;}
jpayne@68 154 if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);}
jpayne@68 155 if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;}
jpayne@68 156 int initial=s.indexOf(delimiter);
jpayne@68 157 if(initial<0){
jpayne@68 158 delimiter='_';
jpayne@68 159 initial=s.indexOf(delimiter);
jpayne@68 160 if(initial<0){return -1;}
jpayne@68 161 }
jpayne@68 162 if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
jpayne@68 163
jpayne@68 164 int number=0;
jpayne@68 165 for(int i=initial+1; i<s.length(); i++){
jpayne@68 166 char c=s.charAt(i);
jpayne@68 167 if(c==delimiter || c==' '){break;}
jpayne@68 168 assert(Tools.isDigit(c)) : c+"\n"+s;
jpayne@68 169 number=(number*10)+(c-'0');
jpayne@68 170 }
jpayne@68 171 return number;
jpayne@68 172 }
jpayne@68 173
jpayne@68 174
jpayne@68 175 public static int getID(String s){return getID(s, '|');}
jpayne@68 176 /** Get the taxID from a header starting with a taxID or gi number */
jpayne@68 177 public static int getID(String s, char delimiter){
jpayne@68 178 int x=parseTaxidNumber(s, delimiter);
jpayne@68 179 if(x>=0){return x;}
jpayne@68 180 x=parseGiNumber(s, delimiter);
jpayne@68 181 if(x>=0){return array[x];}
jpayne@68 182 return -1;
jpayne@68 183 }
jpayne@68 184
jpayne@68 185 /** Parse a gi number, or return -1 if formatted incorrectly. */
jpayne@68 186 static int parseGiNumber(byte[] s, char delimiter){
jpayne@68 187 if(s==null || s.length<4){return -1;}
jpayne@68 188 if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
jpayne@68 189 int initial=Tools.indexOf(s, (byte)delimiter);
jpayne@68 190 if(initial<0){
jpayne@68 191 delimiter='_';
jpayne@68 192 initial=Tools.indexOf(s, (byte)delimiter);
jpayne@68 193 if(initial<0){return -1;}
jpayne@68 194 }
jpayne@68 195 if(!Tools.isDigit(s[initial+1])){return -1;}
jpayne@68 196
jpayne@68 197 long number=0;
jpayne@68 198 for(int i=initial+1; i<s.length; i++){
jpayne@68 199 byte c=s[i];
jpayne@68 200 if(c==delimiter){break;}
jpayne@68 201 assert(Tools.isDigit(c));
jpayne@68 202 number=(number*10)+(c-'0');
jpayne@68 203 }
jpayne@68 204 return (int)number;
jpayne@68 205 }
jpayne@68 206
jpayne@68 207 /** Parse a gi number, or return -1 if formatted incorrectly. */
jpayne@68 208 static int parseNcbiNumber(byte[] s, char delimiter){
jpayne@68 209 if(s==null || s.length<3){return -1;}
jpayne@68 210 if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;}
jpayne@68 211 int initial=Tools.indexOf(s, (byte)delimiter);
jpayne@68 212 if(initial<0){
jpayne@68 213 delimiter='_';
jpayne@68 214 initial=Tools.indexOf(s, (byte)delimiter);
jpayne@68 215 if(initial<0){return -1;}
jpayne@68 216 }
jpayne@68 217 if(!Tools.isDigit(s[initial+1])){return -1;}
jpayne@68 218
jpayne@68 219 int number=0;
jpayne@68 220 for(int i=initial+1; i<s.length; i++){
jpayne@68 221 byte c=s[i];
jpayne@68 222 if(c==delimiter){break;}
jpayne@68 223 assert(Tools.isDigit(c));
jpayne@68 224 number=(number*10)+(c-'0');
jpayne@68 225 }
jpayne@68 226 return number;
jpayne@68 227 }
jpayne@68 228
jpayne@68 229 public static int getID(byte[] s){return getID(s, '|');}
jpayne@68 230 /** Get the taxID from a header starting with a taxID or gi number */
jpayne@68 231 public static int getID(byte[] s, char delimiter){
jpayne@68 232 int x=parseGiNumber(s, delimiter);
jpayne@68 233 if(x>=0){return array[x];}
jpayne@68 234 return parseNcbiNumber(s, delimiter);
jpayne@68 235 }
jpayne@68 236
jpayne@68 237 /** Get the taxID from a gi number */
jpayne@68 238 public static int getID(long gi){
jpayne@68 239 assert(gi>=0) : gi;
jpayne@68 240 assert(gi<Integer.MAX_VALUE) : gi+" > "+Integer.MAX_VALUE;
jpayne@68 241 assert(gi<array.length) : gi+", "+array.length;
jpayne@68 242 return array[(int)gi];
jpayne@68 243 }
jpayne@68 244
jpayne@68 245 public static void initialize(String fname){
jpayne@68 246 assert(fname!=null);
jpayne@68 247 if(fileString==null || !fileString.equals(fname)){
jpayne@68 248 synchronized(GiToTaxid.class){
jpayne@68 249 if(!initialized || fileString==null || !fileString.equals(fname)){
jpayne@68 250 fileString=fname;
jpayne@68 251 if(fname.contains(".int1d")){
jpayne@68 252 array=ReadWrite.read(int[].class, fname, true);
jpayne@68 253 }else{
jpayne@68 254 array=makeArray(fname);
jpayne@68 255 }
jpayne@68 256 }
jpayne@68 257 initialized=true;
jpayne@68 258 }
jpayne@68 259 }
jpayne@68 260 }
jpayne@68 261
jpayne@68 262 public static boolean isInitialized(){return initialized;}
jpayne@68 263
jpayne@68 264 public static synchronized void unload(){
jpayne@68 265 array=null;
jpayne@68 266 fileString=null;
jpayne@68 267 initialized=false;
jpayne@68 268 }
jpayne@68 269
jpayne@68 270 private static int[] makeArray(String fnames){
jpayne@68 271 String[] split;
jpayne@68 272 if(new File(fnames).exists()){split=new String[] {fnames};}
jpayne@68 273 else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
jpayne@68 274 else if(fnames.indexOf('#')>=0){
jpayne@68 275 assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
jpayne@68 276 File dir=new File(System.getProperty("user.dir"));
jpayne@68 277 String prefix=fnames.substring(0, fnames.indexOf('#'));
jpayne@68 278 String suffix=fnames.substring(fnames.indexOf('#')+1);
jpayne@68 279
jpayne@68 280 File[] array=dir.listFiles();
jpayne@68 281 StringBuilder sb=new StringBuilder();
jpayne@68 282 String comma="";
jpayne@68 283 for(File f : array){
jpayne@68 284 String s=f.getName();
jpayne@68 285 if(s.startsWith(prefix) && s.startsWith(suffix)){
jpayne@68 286 sb.append(comma);
jpayne@68 287 sb.append(s);
jpayne@68 288 comma=",";
jpayne@68 289 }
jpayne@68 290 }
jpayne@68 291 split=sb.toString().split(",");
jpayne@68 292 }else{
jpayne@68 293 throw new RuntimeException("Invalid file: "+fnames);
jpayne@68 294 }
jpayne@68 295
jpayne@68 296 IntList list=new IntList();
jpayne@68 297 // assert(max<Integer.MAX_VALUE) : "Overflow.";
jpayne@68 298 // int[] x=new int[(int)max+1];
jpayne@68 299 // Arrays.fill(x, -1);
jpayne@68 300
jpayne@68 301 long total=0;
jpayne@68 302 for(String s : split){
jpayne@68 303 long count=addToList(s, list);
jpayne@68 304 total+=count;
jpayne@68 305 }
jpayne@68 306 return list.shrink().array;
jpayne@68 307 }
jpayne@68 308
jpayne@68 309 private static long addToList(String fname, IntList list){
jpayne@68 310 boolean warned=false;
jpayne@68 311 ByteFile bf=ByteFile.makeByteFile(fname, true);
jpayne@68 312 long count=0, invalid=0;
jpayne@68 313 byte[] line=bf.nextLine();
jpayne@68 314 while(line!=null){
jpayne@68 315 if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na
jpayne@68 316 count++;
jpayne@68 317 int tab2=Tools.indexOfNth(line, '\t', 2);
jpayne@68 318 int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1);
jpayne@68 319 assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
jpayne@68 320 assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
jpayne@68 321 assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
jpayne@68 322 // assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
jpayne@68 323 int tid=Parse.parseInt(line, tab2+1, tab3);
jpayne@68 324 int gi=Parse.parseInt(line, tab3+1, line.length);
jpayne@68 325 if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
jpayne@68 326 invalid++;
jpayne@68 327 }else{
jpayne@68 328 assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
jpayne@68 329 int old=list.get(gi);
jpayne@68 330 assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
jpayne@68 331
jpayne@68 332 list.set(gi, tid);
jpayne@68 333
jpayne@68 334 //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
jpayne@68 335 // if(x[gi]!=-1 && x[gi]!=ncbi){
jpayne@68 336 // if(!warned){
jpayne@68 337 // System.err.println("***WARNING*** For file "+fname+":\n"+
jpayne@68 338 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
jpayne@68 339 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
jpayne@68 340 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
jpayne@68 341 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
jpayne@68 342 // warned=true;
jpayne@68 343 // }
jpayne@68 344 // }else{
jpayne@68 345 // x[gi]=ncbi;
jpayne@68 346 // }
jpayne@68 347 }
jpayne@68 348 }else{
jpayne@68 349 if(line.length==0){System.err.println(fname+", "+count);}//debug
jpayne@68 350 invalid++;
jpayne@68 351 }
jpayne@68 352 line=bf.nextLine();
jpayne@68 353 }
jpayne@68 354 if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);}
jpayne@68 355 bf.close();
jpayne@68 356 return count;
jpayne@68 357 }
jpayne@68 358
jpayne@68 359 // private static int[] makeArrayOld(String fnames){
jpayne@68 360 // String[] split;
jpayne@68 361 // if(new File(fnames).exists()){split=new String[] {fnames};}
jpayne@68 362 // else{split=fnames.split(",");}
jpayne@68 363 //
jpayne@68 364 // long max=0;
jpayne@68 365 // for(String s : split){
jpayne@68 366 // max=Tools.max(max, findMaxID(s));
jpayne@68 367 // }
jpayne@68 368 //
jpayne@68 369 // assert(max<Integer.MAX_VALUE) : "Overflow.";
jpayne@68 370 // int[] x=new int[(int)max+1];
jpayne@68 371 // Arrays.fill(x, -1);
jpayne@68 372 //
jpayne@68 373 // long total=0;
jpayne@68 374 // for(String s : split){
jpayne@68 375 // long count=fillArray(s, x);
jpayne@68 376 // total+=count;
jpayne@68 377 // }
jpayne@68 378 // return x;
jpayne@68 379 // }
jpayne@68 380 //
jpayne@68 381 // private static long findMaxID(String fname){
jpayne@68 382 // ByteFile bf=ByteFile.makeByteFile(fname, true);
jpayne@68 383 // long count=0, max=0;
jpayne@68 384 // byte[] line=bf.nextLine();
jpayne@68 385 // while(line!=null){
jpayne@68 386 // count++;
jpayne@68 387 // int tab=Tools.indexOf(line, (byte)'\t');
jpayne@68 388 // long gi=Parse.parseLong(line, 0, tab);
jpayne@68 389 // max=Tools.max(max, gi);
jpayne@68 390 // line=bf.nextLine();
jpayne@68 391 // }
jpayne@68 392 // bf.close();
jpayne@68 393 // return max;
jpayne@68 394 // }
jpayne@68 395 //
jpayne@68 396 // private static long fillArray(String fname, int[] x){
jpayne@68 397 // boolean warned=false;
jpayne@68 398 // ByteFile bf=ByteFile.makeByteFile(fname, true);
jpayne@68 399 // long count=0;
jpayne@68 400 // byte[] line=bf.nextLine();
jpayne@68 401 // while(line!=null){
jpayne@68 402 // count++;
jpayne@68 403 // int tab=Tools.indexOf(line, (byte)'\t');
jpayne@68 404 // int gi=Parse.parseInt(line, 0, tab);
jpayne@68 405 // int ncbi=Parse.parseInt(line, tab+1, line.length);
jpayne@68 406 // //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
jpayne@68 407 // if(x[gi]!=-1 && x[gi]!=ncbi){
jpayne@68 408 // if(!warned){
jpayne@68 409 // System.err.println("***WARNING*** For file "+fname+":\n"+
jpayne@68 410 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
jpayne@68 411 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n"
jpayne@68 412 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
jpayne@68 413 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
jpayne@68 414 // warned=true;
jpayne@68 415 // }
jpayne@68 416 // }else{
jpayne@68 417 // x[gi]=ncbi;
jpayne@68 418 // }
jpayne@68 419 // line=bf.nextLine();
jpayne@68 420 // }
jpayne@68 421 // if(verbose){System.err.println("Count: "+count);}
jpayne@68 422 // bf.close();
jpayne@68 423 // return count;
jpayne@68 424 // }
jpayne@68 425
jpayne@68 426 private static int[] array;
jpayne@68 427 private static String fileString;
jpayne@68 428
jpayne@68 429 public static boolean verbose=false;
jpayne@68 430 private static boolean initialized=false;
jpayne@68 431 }