jpayne@68: package tax; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.util.ArrayList; jpayne@68: jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.ReadWrite; jpayne@68: import shared.Parse; jpayne@68: import shared.Shared; jpayne@68: import shared.Tools; jpayne@68: import structures.IntList; jpayne@68: jpayne@68: /** jpayne@68: * @author Brian Bushnell jpayne@68: * @date Mar 10, 2015 jpayne@68: * jpayne@68: */ jpayne@68: public class GiToTaxidInt { jpayne@68: jpayne@68: public static void main(String[] args){ jpayne@68: ReadWrite.USE_UNPIGZ=true; jpayne@68: ReadWrite.USE_PIGZ=true; jpayne@68: ReadWrite.ZIPLEVEL=9; jpayne@68: ReadWrite.PIGZ_BLOCKSIZE=256; jpayne@68: // ReadWrite.PIGZ_ITERATIONS=30; jpayne@68: jpayne@68: for(String arg : args){ jpayne@68: String[] split=arg.split("="); jpayne@68: String a=split[0].toLowerCase(); jpayne@68: String b=split.length>1 ? split[1] : null; jpayne@68: shared.Parser.parseZip(arg, a, b); jpayne@68: } jpayne@68: // if(args.length>2 && false){//Run a test jpayne@68: // test(args); jpayne@68: // }else jpayne@68: if(args.length>=2){//Write array jpayne@68: initialize(args[0]); jpayne@68: ReadWrite.write(array, args[1], true); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public static void test(String[] args){ jpayne@68: System.err.println(getID(1000)); jpayne@68: System.err.println(getID(10000)); jpayne@68: System.err.println(getID(10001)); jpayne@68: System.err.println(getID(10002)); jpayne@68: System.err.println(getID(10003)); jpayne@68: System.err.println(getID(10004)); jpayne@68: System.err.println(getID(10005)); jpayne@68: System.err.println(getID(100000)); jpayne@68: System.err.println(getID(1000000)); jpayne@68: System.err.println(getID(10000000)); jpayne@68: jpayne@68: TaxTree tree=null; jpayne@68: if(args.length>1){ jpayne@68: tree=TaxTree.loadTaxTree(args[0], System.err, true, true); jpayne@68: } jpayne@68: jpayne@68: System.err.println("Strings:"); jpayne@68: int x; jpayne@68: x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44"); jpayne@68: System.err.println(x); jpayne@68: if(tree!=null){ jpayne@68: System.err.println(tree.getNode(x)); jpayne@68: tree.incrementRaw(x, 30); jpayne@68: } jpayne@68: x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds"); jpayne@68: System.err.println(x); jpayne@68: if(tree!=null){ jpayne@68: System.err.println(tree.getNode(x)); jpayne@68: tree.incrementRaw(x, 40); jpayne@68: } jpayne@68: x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence"); jpayne@68: System.err.println(x); jpayne@68: if(tree!=null){ jpayne@68: System.err.println(tree.getNode(x)); jpayne@68: tree.incrementRaw(x, 20); jpayne@68: } jpayne@68: jpayne@68: if(tree!=null){ jpayne@68: tree.percolateUp(); jpayne@68: ArrayList nodes=tree.gatherNodesAtLeastLimit(35); jpayne@68: for(TaxNode n : nodes){ jpayne@68: System.err.println(n); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');} jpayne@68: public static int parseGiToTaxid(String s, char delimiter){ jpayne@68: int x=parseGiNumber(s, delimiter); jpayne@68: assert(x>=0) : s; jpayne@68: assert(array!=null) : "To use gi numbers, you must load a gi table."; jpayne@68: // if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty. jpayne@68: if(x>=0 && x=0 && x'){return getID(s.substring(1), delimiter);} jpayne@68: // System.err.println("b"); jpayne@68: if(!s.startsWith("gi")){return -1;} jpayne@68: // System.err.println("c"); jpayne@68: // System.err.println("d"); jpayne@68: int initial=s.indexOf(delimiter); jpayne@68: // System.err.println("e"); jpayne@68: if(initial<0){ jpayne@68: if(delimiter!='~'){ jpayne@68: delimiter='~'; jpayne@68: initial=s.indexOf(delimiter); jpayne@68: } jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=s.indexOf(delimiter); jpayne@68: } jpayne@68: if(initial<0){return -1;} jpayne@68: // System.err.println("f"); jpayne@68: // System.err.println("g"); jpayne@68: } jpayne@68: // System.err.println("h"); jpayne@68: if(!Tools.isDigit(s.charAt(initial+1))){return -1;} jpayne@68: // System.err.println("i"); jpayne@68: jpayne@68: int number=0; jpayne@68: for(int i=initial+1; i'){return parseTaxidNumber(s.substring(1), delimiter);} jpayne@68: if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;} jpayne@68: int initial=s.indexOf(delimiter); jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=s.indexOf(delimiter); jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s.charAt(initial+1))){return -1;} jpayne@68: jpayne@68: int number=0; jpayne@68: for(int i=initial+1; i=0){return x;} jpayne@68: x=parseGiNumber(s, delimiter); jpayne@68: if(x>=0){return array[x];} jpayne@68: return -1; jpayne@68: } jpayne@68: jpayne@68: /** Parse a gi number, or return -1 if formatted incorrectly. */ jpayne@68: static int parseGiNumber(byte[] s, char delimiter){ jpayne@68: if(s==null || s.length<4){return -1;} jpayne@68: if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;} jpayne@68: int initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s[initial+1])){return -1;} jpayne@68: jpayne@68: long number=0; jpayne@68: for(int i=initial+1; incbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;} jpayne@68: int initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s[initial+1])){return -1;} jpayne@68: jpayne@68: int number=0; jpayne@68: for(int i=initial+1; i=0){return array[x];} jpayne@68: return parseNcbiNumber(s, delimiter); jpayne@68: } jpayne@68: jpayne@68: /** Get the taxID from a gi number */ jpayne@68: public static int getID(long gi){ jpayne@68: assert(gi>=0) : gi; jpayne@68: assert(gi "+Integer.MAX_VALUE; jpayne@68: assert(gi=0){split=fnames.split(",");} jpayne@68: else if(fnames.indexOf('#')>=0){ jpayne@68: assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory."; jpayne@68: File dir=new File(System.getProperty("user.dir")); jpayne@68: String prefix=fnames.substring(0, fnames.indexOf('#')); jpayne@68: String suffix=fnames.substring(fnames.indexOf('#')+1); jpayne@68: jpayne@68: File[] array=dir.listFiles(); jpayne@68: StringBuilder sb=new StringBuilder(); jpayne@68: String comma=""; jpayne@68: for(File f : array){ jpayne@68: String s=f.getName(); jpayne@68: if(s.startsWith(prefix) && s.startsWith(suffix)){ jpayne@68: sb.append(comma); jpayne@68: sb.append(s); jpayne@68: comma=","; jpayne@68: } jpayne@68: } jpayne@68: split=sb.toString().split(","); jpayne@68: }else{ jpayne@68: throw new RuntimeException("Invalid file: "+fnames); jpayne@68: } jpayne@68: jpayne@68: IntList list=new IntList(); jpayne@68: // assert(max0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na jpayne@68: count++; jpayne@68: int tab2=Tools.indexOfNth(line, '\t', 2); jpayne@68: int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1); jpayne@68: assert(tab2>0 && (tab2=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021. jpayne@68: invalid++; jpayne@68: }else{ jpayne@68: assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'"; jpayne@68: int old=list.get(gi); jpayne@68: assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3; jpayne@68: jpayne@68: list.set(gi, tid); jpayne@68: jpayne@68: //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi; jpayne@68: // if(x[gi]!=-1 && x[gi]!=ncbi){ jpayne@68: // if(!warned){ jpayne@68: // System.err.println("***WARNING*** For file "+fname+":\n"+ jpayne@68: // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ jpayne@68: // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" jpayne@68: // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" jpayne@68: // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); jpayne@68: // warned=true; jpayne@68: // } jpayne@68: // }else{ jpayne@68: // x[gi]=ncbi; jpayne@68: // } jpayne@68: } jpayne@68: }else{ jpayne@68: if(line.length==0){System.err.println(fname+", "+count);}//debug jpayne@68: invalid++; jpayne@68: } jpayne@68: line=bf.nextLine(); jpayne@68: } jpayne@68: if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);} jpayne@68: bf.close(); jpayne@68: return count; jpayne@68: } jpayne@68: jpayne@68: // private static int[] makeArrayOld(String fnames){ jpayne@68: // String[] split; jpayne@68: // if(new File(fnames).exists()){split=new String[] {fnames};} jpayne@68: // else{split=fnames.split(",");} jpayne@68: // jpayne@68: // long max=0; jpayne@68: // for(String s : split){ jpayne@68: // max=Tools.max(max, findMaxID(s)); jpayne@68: // } jpayne@68: // jpayne@68: // assert(max "+ncbi; jpayne@68: // if(x[gi]!=-1 && x[gi]!=ncbi){ jpayne@68: // if(!warned){ jpayne@68: // System.err.println("***WARNING*** For file "+fname+":\n"+ jpayne@68: // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ jpayne@68: // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" jpayne@68: // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" jpayne@68: // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); jpayne@68: // warned=true; jpayne@68: // } jpayne@68: // }else{ jpayne@68: // x[gi]=ncbi; jpayne@68: // } jpayne@68: // line=bf.nextLine(); jpayne@68: // } jpayne@68: // if(verbose){System.err.println("Count: "+count);} jpayne@68: // bf.close(); jpayne@68: // return count; jpayne@68: // } jpayne@68: jpayne@68: private static int[] array; jpayne@68: private static String fileString; jpayne@68: jpayne@68: public static boolean verbose=false; jpayne@68: private static boolean initialized=false; jpayne@68: }