Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/GiToTaxid.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package tax; import java.io.File; import java.util.ArrayList; import fileIO.ByteFile; import fileIO.ReadWrite; import shared.Parse; import shared.Shared; import shared.Tools; import structures.IntList; /** * @author Brian Bushnell * @date Mar 10, 2015 * */ public class GiToTaxid { public static void main(String[] args){ ReadWrite.USE_UNPIGZ=true; ReadWrite.USE_PIGZ=true; ReadWrite.ZIPLEVEL=9; ReadWrite.PIGZ_BLOCKSIZE=256; // ReadWrite.PIGZ_ITERATIONS=30; for(String arg : args){ String[] split=arg.split("="); String a=split[0].toLowerCase(); String b=split.length>1 ? split[1] : null; shared.Parser.parseZip(arg, a, b); } // if(args.length>2 && false){//Run a test // test(args); // }else if(args.length>=2){//Write array initialize(args[0]); ReadWrite.write(array, args[1], true); } } public static void test(String[] args){ System.err.println(getID(1000)); System.err.println(getID(10000)); System.err.println(getID(10001)); System.err.println(getID(10002)); System.err.println(getID(10003)); System.err.println(getID(10004)); System.err.println(getID(10005)); System.err.println(getID(100000)); System.err.println(getID(1000000)); System.err.println(getID(10000000)); TaxTree tree=null; if(args.length>1){ tree=TaxTree.loadTaxTree(args[0], System.err, true, true); } System.err.println("Strings:"); int x; x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44"); System.err.println(x); if(tree!=null){ System.err.println(tree.getNode(x)); tree.incrementRaw(x, 30); } x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds"); System.err.println(x); if(tree!=null){ System.err.println(tree.getNode(x)); tree.incrementRaw(x, 40); } x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence"); System.err.println(x); if(tree!=null){ System.err.println(tree.getNode(x)); tree.incrementRaw(x, 20); } if(tree!=null){ tree.percolateUp(); ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35); for(TaxNode n : nodes){ System.err.println(n); } } } public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');} public static int parseGiToTaxid(String s, char delimiter){ long x=parseGiNumber(s, delimiter); assert(x>=0) : x+", "+s; return getID(x); } public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');} public static int parseGiToTaxid(byte[] s, char delimiter){ long x=parseGiNumber(s, delimiter); return x<0 ? -1 : getID(x); } /** Parse a gi number, or return -1 if formatted incorrectly. */ static long parseGiNumber(String s, char delimiter){ if(s==null || s.length()<4){return -1;} if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);} if(!s.startsWith("gi")){return -1;} int initial=s.indexOf(delimiter); if(initial<0){ if(delimiter!='~'){ delimiter='~'; initial=s.indexOf(delimiter); } if(initial<0){ delimiter='_'; initial=s.indexOf(delimiter); } if(initial<0){return -1;} } if(!Tools.isDigit(s.charAt(initial+1))){return -1;} long number=0; for(int i=initial+1; i<s.length(); i++){ char c=s.charAt(i); if(c==delimiter){break;} assert(Tools.isDigit(c)); number=(number*10)+(c-'0'); } return number; } /** Parse a ncbi number, or return -1 if formatted incorrectly. */ public static int parseTaxidNumber(String s, char delimiter){ if(s==null || s.length()<5){return -1;} if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);} if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;} int initial=s.indexOf(delimiter); if(initial<0){ delimiter='_'; initial=s.indexOf(delimiter); if(initial<0){return -1;} } if(!Tools.isDigit(s.charAt(initial+1))){return -1;} int number=0; for(int i=initial+1; i<s.length(); i++){ char c=s.charAt(i); if(c==delimiter || c==' '){break;} assert(Tools.isDigit(c)) : c+"\n"+s; number=(number*10)+(c-'0'); } return number; } public static int getID(String s){return getID(s, '|');} /** Get the taxID from a header starting with a taxID or gi number */ public static int getID(String s, char delimiter){ long x=parseTaxidNumber(s, delimiter); if(x>=0){return (int)x;} x=parseGiNumber(s, delimiter); return x<0 ? -1 : getID(x); } /** Parse a gi number, or return -1 if formatted incorrectly. */ static long parseGiNumber(byte[] s, char delimiter){ if(s==null || s.length<4){return -1;} if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;} int initial=Tools.indexOf(s, (byte)delimiter); if(initial<0){ delimiter='_'; initial=Tools.indexOf(s, (byte)delimiter); if(initial<0){return -1;} } if(!Tools.isDigit(s[initial+1])){return -1;} long number=0; for(int i=initial+1; i<s.length; i++){ byte c=s[i]; if(c==delimiter){break;} assert(Tools.isDigit(c)); number=(number*10)+(c-'0'); } return number; } /** Parse a gi number, or return -1 if formatted incorrectly. */ static int parseNcbiNumber(byte[] s, char delimiter){ if(s==null || s.length<3){return -1;} if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;} int initial=Tools.indexOf(s, (byte)delimiter); if(initial<0){ delimiter='_'; initial=Tools.indexOf(s, (byte)delimiter); if(initial<0){return -1;} } if(!Tools.isDigit(s[initial+1])){return -1;} int number=0; for(int i=initial+1; i<s.length; i++){ byte c=s[i]; if(c==delimiter){break;} assert(Tools.isDigit(c)); number=(number*10)+(c-'0'); } return number; } public static int getID(byte[] s){return getID(s, '|');} /** Get the taxID from a header starting with a taxID or gi number */ public static int getID(byte[] s, char delimiter){ long x=parseGiNumber(s, delimiter); if(x>=0){return getID(x, true);} return parseNcbiNumber(s, delimiter); } /** Get the taxID from a gi number; * -1 if not present or invalid (negative input), * -2 if out of range (too high) */ public static int getID(long gi){ return getID(gi, true); } /** Get the taxID from a gi number; * 0 if not present, * -1 if invalid (negative input), * -2 if out of range (too high) */ public static int getID(long gi, boolean assertInRange){ assert(initialized) : "To use gi numbers, you must load a gi table."; if(gi<0 || gi>maxGiLoaded){ assert(!assertInRange) : gi<0 ? "gi number "+gi+" is invalid." : "The gi number "+gi+" is too big: Max loaded gi number is "+maxGiLoaded+".\n" + "Please update the gi table with the latest version from NCBI" + " as per the instructions in gitable.sh.\n" + "To ignore this problem, please run with the -da flag.\n"; return gi<0 ? -1 : -2; } final long upper=gi>>>SHIFT; final int lower=(int)(gi&LOWERMASK); assert(upper<Shared.MAX_ARRAY_LEN && upper<array.length) : gi+", "+upper+", "+array.length; final int[] slice=array[(int)upper]; return slice==null || slice.length<=lower ? 0 : slice[lower]; } public static void initialize(String fname){ assert(fname!=null); if(fileString==null || !fileString.equals(fname)){ synchronized(GiToTaxid.class){ if(!initialized || fileString==null || !fileString.equals(fname)){ fileString=fname; if(fname.contains(".int2d")){ array=ReadWrite.read(int[][].class, fname, true); maxGiLoaded=-1; if(array!=null && array.length>0){ int upper=array.length-1; int[] section=array[upper]; int lower=section.length-1; maxGiLoaded=(((long)upper)<<SHIFT)|lower; } }else if(fname.contains(".int1d")){ throw new RuntimeException("Old gi table format filename "+fname+".\n" + "Current files should end in .int2d."); }else{ array=makeArray(fname); } } initialized=true; } } } public static boolean isInitialized(){return initialized;} public static synchronized void unload(){ maxGiLoaded=-1; array=null; fileString=null; initialized=false; } private static int[][] makeArray(String fnames){ String[] split; if(new File(fnames).exists()){split=new String[] {fnames};} else if(fnames.indexOf(',')>=0){split=fnames.split(",");} else if(fnames.indexOf('#')>=0){ assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for " + "relative paths in present working directory."; File dir=new File(System.getProperty("user.dir")); String prefix=fnames.substring(0, fnames.indexOf('#')); String suffix=fnames.substring(fnames.indexOf('#')+1); File[] array=dir.listFiles(); StringBuilder sb=new StringBuilder(); String comma=""; for(File f : array){ String s=f.getName(); if(s.startsWith(prefix) && s.startsWith(suffix)){ sb.append(comma); sb.append(s); comma=","; } } split=sb.toString().split(","); }else{ throw new RuntimeException("Invalid file: "+fnames); } int numLists=32; IntList[] lists=new IntList[numLists]; long total=0; for(String s : split){ long count=addToList(s, lists); total+=count; } for(int i=0; i<lists.length; i++){ if(lists[i]!=null && lists[i].size>0){ lists[i].shrink(); numLists=i+1; } } int[][] table=new int[numLists][]; for(int i=0; i<numLists; i++){ table[i]=lists[i].array; } return table; } private static long addToList(String fname, IntList[] lists){ boolean warned=false; ByteFile bf=ByteFile.makeByteFile(fname, true); long count=0, invalid=0; byte[] line=bf.nextLine(); while(line!=null){ if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na count++; int tab2=Tools.indexOfNth(line, '\t', 2); int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1); assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length; assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'"; assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'"; //assert(false) : tab2+", "+tab3+", '"+new String(line)+"'"; int tid=Parse.parseInt(line, tab2+1, tab3); int gi=Parse.parseInt(line, tab3+1, line.length); if(gi<0){ invalid++; }else{ assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'"; int old=setID(gi, tid, lists); assert(old<1 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3; } }else{ //if(line.length==0){System.err.println(fname+", "+count);}//debug invalid++; } line=bf.nextLine(); } if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);} bf.close(); return count; } private static int getID(long gi, IntList[] lists){ assert(gi>=0) : "gi number "+gi+" is invalid."; final long upper=gi>>>SHIFT; final int lower=(int)(gi&LOWERMASK); assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper; IntList list=lists[(int)upper]; return lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower); } private static int setID(long gi, int tid, IntList[] lists){ assert(gi>=0) : "gi number "+gi+" is invalid."; final long upper=gi>>>SHIFT; final int lower=(int)(gi&LOWERMASK); assert(upper<Shared.MAX_ARRAY_LEN) : gi+", "+upper; IntList list=lists[(int)upper]; if(list==null){list=lists[(int)upper]=new IntList();} int old=lower<0 ? -1 : lower>=list.size ? -2 : list.get(lower); list.set(lower, tid); maxGiLoaded=Tools.max(gi, maxGiLoaded); return old; } // private static int[] makeArrayOld(String fnames){ // String[] split; // if(new File(fnames).exists()){split=new String[] {fnames};} // else{split=fnames.split(",");} // // long max=0; // for(String s : split){ // max=Tools.max(max, findMaxID(s)); // } // // assert(max<Integer.MAX_VALUE) : "Overflow."; // int[] x=new int[(int)max+1]; // Arrays.fill(x, -1); // // long total=0; // for(String s : split){ // long count=fillArray(s, x); // total+=count; // } // return x; // } // // private static long findMaxID(String fname){ // ByteFile bf=ByteFile.makeByteFile(fname, true); // long count=0, max=0; // byte[] line=bf.nextLine(); // while(line!=null){ // count++; // int tab=Tools.indexOf(line, (byte)'\t'); // long gi=Parse.parseLong(line, 0, tab); // max=Tools.max(max, gi); // line=bf.nextLine(); // } // bf.close(); // return max; // } // // private static long fillArray(String fname, int[] x){ // boolean warned=false; // ByteFile bf=ByteFile.makeByteFile(fname, true); // long count=0; // byte[] line=bf.nextLine(); // while(line!=null){ // count++; // int tab=Tools.indexOf(line, (byte)'\t'); // int gi=Parse.parseInt(line, 0, tab); // int ncbi=Parse.parseInt(line, tab+1, line.length); // //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi; // if(x[gi]!=-1 && x[gi]!=ncbi){ // if(!warned){ // System.err.println("***WARNING*** For file "+fname+":\n"+ // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); // warned=true; // } // }else{ // x[gi]=ncbi; // } // line=bf.nextLine(); // } // if(verbose){System.err.println("Count: "+count);} // bf.close(); // return count; // } private static long maxGiLoaded=-1; private static int[][] array; private static final int SHIFT=30; private static final long UPPERMASK=(-1L)<<SHIFT; private static final long LOWERMASK=~UPPERMASK; private static String fileString; public static boolean verbose=false; private static boolean initialized=false; }