jpayne@68: package tax; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.util.ArrayList; jpayne@68: jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.ReadWrite; jpayne@68: import shared.Parse; jpayne@68: import shared.Shared; jpayne@68: import shared.Tools; jpayne@68: import structures.IntList; jpayne@68: jpayne@68: /** jpayne@68: * @author Brian Bushnell jpayne@68: * @date Mar 10, 2015 jpayne@68: * jpayne@68: */ jpayne@68: public class GiToTaxid { jpayne@68: jpayne@68: public static void main(String[] args){ jpayne@68: ReadWrite.USE_UNPIGZ=true; jpayne@68: ReadWrite.USE_PIGZ=true; jpayne@68: ReadWrite.ZIPLEVEL=9; jpayne@68: ReadWrite.PIGZ_BLOCKSIZE=256; jpayne@68: // ReadWrite.PIGZ_ITERATIONS=30; jpayne@68: jpayne@68: for(String arg : args){ jpayne@68: String[] split=arg.split("="); jpayne@68: String a=split[0].toLowerCase(); jpayne@68: String b=split.length>1 ? split[1] : null; jpayne@68: shared.Parser.parseZip(arg, a, b); jpayne@68: } jpayne@68: // if(args.length>2 && false){//Run a test jpayne@68: // test(args); jpayne@68: // }else jpayne@68: if(args.length>=2){//Write array jpayne@68: initialize(args[0]); jpayne@68: ReadWrite.write(array, args[1], true); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public static void test(String[] args){ jpayne@68: System.err.println(getID(1000)); jpayne@68: System.err.println(getID(10000)); jpayne@68: System.err.println(getID(10001)); jpayne@68: System.err.println(getID(10002)); jpayne@68: System.err.println(getID(10003)); jpayne@68: System.err.println(getID(10004)); jpayne@68: System.err.println(getID(10005)); jpayne@68: System.err.println(getID(100000)); jpayne@68: System.err.println(getID(1000000)); jpayne@68: System.err.println(getID(10000000)); jpayne@68: jpayne@68: TaxTree tree=null; jpayne@68: if(args.length>1){ jpayne@68: tree=TaxTree.loadTaxTree(args[0], System.err, true, true); jpayne@68: } jpayne@68: jpayne@68: System.err.println("Strings:"); jpayne@68: int x; jpayne@68: x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44"); jpayne@68: System.err.println(x); jpayne@68: if(tree!=null){ jpayne@68: System.err.println(tree.getNode(x)); jpayne@68: tree.incrementRaw(x, 30); jpayne@68: } jpayne@68: x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds"); jpayne@68: System.err.println(x); jpayne@68: if(tree!=null){ jpayne@68: System.err.println(tree.getNode(x)); jpayne@68: tree.incrementRaw(x, 40); jpayne@68: } jpayne@68: x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence"); jpayne@68: System.err.println(x); jpayne@68: if(tree!=null){ jpayne@68: System.err.println(tree.getNode(x)); jpayne@68: tree.incrementRaw(x, 20); jpayne@68: } jpayne@68: jpayne@68: if(tree!=null){ jpayne@68: tree.percolateUp(); jpayne@68: ArrayList nodes=tree.gatherNodesAtLeastLimit(35); jpayne@68: for(TaxNode n : nodes){ jpayne@68: System.err.println(n); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');} jpayne@68: public static int parseGiToTaxid(String s, char delimiter){ jpayne@68: long x=parseGiNumber(s, delimiter); jpayne@68: assert(x>=0) : x+", "+s; jpayne@68: return getID(x); jpayne@68: } jpayne@68: jpayne@68: jpayne@68: public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');} jpayne@68: public static int parseGiToTaxid(byte[] s, char delimiter){ jpayne@68: long x=parseGiNumber(s, delimiter); jpayne@68: return x<0 ? -1 : getID(x); jpayne@68: } jpayne@68: jpayne@68: /** Parse a gi number, or return -1 if formatted incorrectly. */ jpayne@68: static long parseGiNumber(String s, char delimiter){ jpayne@68: if(s==null || s.length()<4){return -1;} jpayne@68: if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);} jpayne@68: if(!s.startsWith("gi")){return -1;} jpayne@68: int initial=s.indexOf(delimiter); jpayne@68: if(initial<0){ jpayne@68: if(delimiter!='~'){ jpayne@68: delimiter='~'; jpayne@68: initial=s.indexOf(delimiter); jpayne@68: } jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=s.indexOf(delimiter); jpayne@68: } jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s.charAt(initial+1))){return -1;} jpayne@68: jpayne@68: long number=0; jpayne@68: for(int i=initial+1; i'){return parseTaxidNumber(s.substring(1), delimiter);} jpayne@68: if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;} jpayne@68: int initial=s.indexOf(delimiter); jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=s.indexOf(delimiter); jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s.charAt(initial+1))){return -1;} jpayne@68: jpayne@68: int number=0; jpayne@68: for(int i=initial+1; i=0){return (int)x;} jpayne@68: x=parseGiNumber(s, delimiter); jpayne@68: return x<0 ? -1 : getID(x); jpayne@68: } jpayne@68: jpayne@68: /** Parse a gi number, or return -1 if formatted incorrectly. */ jpayne@68: static long parseGiNumber(byte[] s, char delimiter){ jpayne@68: if(s==null || s.length<4){return -1;} jpayne@68: if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;} jpayne@68: int initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s[initial+1])){return -1;} jpayne@68: jpayne@68: long number=0; jpayne@68: for(int i=initial+1; incbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;} jpayne@68: int initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){ jpayne@68: delimiter='_'; jpayne@68: initial=Tools.indexOf(s, (byte)delimiter); jpayne@68: if(initial<0){return -1;} jpayne@68: } jpayne@68: if(!Tools.isDigit(s[initial+1])){return -1;} jpayne@68: jpayne@68: int number=0; jpayne@68: for(int i=initial+1; i=0){return getID(x, true);} jpayne@68: return parseNcbiNumber(s, delimiter); jpayne@68: } jpayne@68: jpayne@68: /** Get the taxID from a gi number; jpayne@68: * -1 if not present or invalid (negative input), jpayne@68: * -2 if out of range (too high) */ jpayne@68: public static int getID(long gi){ jpayne@68: return getID(gi, true); jpayne@68: } jpayne@68: jpayne@68: /** Get the taxID from a gi number; jpayne@68: * 0 if not present, jpayne@68: * -1 if invalid (negative input), jpayne@68: * -2 if out of range (too high) */ jpayne@68: public static int getID(long gi, boolean assertInRange){ jpayne@68: assert(initialized) : "To use gi numbers, you must load a gi table."; jpayne@68: if(gi<0 || gi>maxGiLoaded){ jpayne@68: assert(!assertInRange) : gi<0 ? "gi number "+gi+" is invalid." : jpayne@68: "The gi number "+gi+" is too big: Max loaded gi number is "+maxGiLoaded+".\n" jpayne@68: + "Please update the gi table with the latest version from NCBI" jpayne@68: + " as per the instructions in gitable.sh.\n" jpayne@68: + "To ignore this problem, please run with the -da flag.\n"; jpayne@68: return gi<0 ? -1 : -2; jpayne@68: } jpayne@68: final long upper=gi>>>SHIFT; jpayne@68: final int lower=(int)(gi&LOWERMASK); jpayne@68: assert(upper0){ jpayne@68: int upper=array.length-1; jpayne@68: int[] section=array[upper]; jpayne@68: int lower=section.length-1; jpayne@68: maxGiLoaded=(((long)upper)<=0){split=fnames.split(",");} jpayne@68: else if(fnames.indexOf('#')>=0){ jpayne@68: assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for " jpayne@68: + "relative paths in present working directory."; jpayne@68: File dir=new File(System.getProperty("user.dir")); jpayne@68: String prefix=fnames.substring(0, fnames.indexOf('#')); jpayne@68: String suffix=fnames.substring(fnames.indexOf('#')+1); jpayne@68: jpayne@68: File[] array=dir.listFiles(); jpayne@68: StringBuilder sb=new StringBuilder(); jpayne@68: String comma=""; jpayne@68: for(File f : array){ jpayne@68: String s=f.getName(); jpayne@68: if(s.startsWith(prefix) && s.startsWith(suffix)){ jpayne@68: sb.append(comma); jpayne@68: sb.append(s); jpayne@68: comma=","; jpayne@68: } jpayne@68: } jpayne@68: split=sb.toString().split(","); jpayne@68: }else{ jpayne@68: throw new RuntimeException("Invalid file: "+fnames); jpayne@68: } jpayne@68: jpayne@68: int numLists=32; jpayne@68: IntList[] lists=new IntList[numLists]; jpayne@68: jpayne@68: long total=0; jpayne@68: for(String s : split){ jpayne@68: long count=addToList(s, lists); jpayne@68: total+=count; jpayne@68: } jpayne@68: for(int i=0; i0){ jpayne@68: lists[i].shrink(); jpayne@68: numLists=i+1; jpayne@68: } jpayne@68: } jpayne@68: int[][] table=new int[numLists][]; jpayne@68: for(int i=0; i0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na jpayne@68: count++; jpayne@68: int tab2=Tools.indexOfNth(line, '\t', 2); jpayne@68: int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1); jpayne@68: assert(tab2>0 && (tab2=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'"; jpayne@68: int old=setID(gi, tid, lists); jpayne@68: assert(old<1 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3; jpayne@68: } jpayne@68: }else{ jpayne@68: //if(line.length==0){System.err.println(fname+", "+count);}//debug jpayne@68: invalid++; jpayne@68: } jpayne@68: line=bf.nextLine(); jpayne@68: } jpayne@68: if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);} jpayne@68: bf.close(); jpayne@68: return count; jpayne@68: } jpayne@68: jpayne@68: private static int getID(long gi, IntList[] lists){ jpayne@68: assert(gi>=0) : "gi number "+gi+" is invalid."; jpayne@68: final long upper=gi>>>SHIFT; jpayne@68: final int lower=(int)(gi&LOWERMASK); jpayne@68: assert(upper=list.size ? -2 : list.get(lower); jpayne@68: } jpayne@68: jpayne@68: private static int setID(long gi, int tid, IntList[] lists){ jpayne@68: assert(gi>=0) : "gi number "+gi+" is invalid."; jpayne@68: final long upper=gi>>>SHIFT; jpayne@68: final int lower=(int)(gi&LOWERMASK); jpayne@68: assert(upper=list.size ? -2 : list.get(lower); jpayne@68: list.set(lower, tid); jpayne@68: maxGiLoaded=Tools.max(gi, maxGiLoaded); jpayne@68: return old; jpayne@68: } jpayne@68: jpayne@68: // private static int[] makeArrayOld(String fnames){ jpayne@68: // String[] split; jpayne@68: // if(new File(fnames).exists()){split=new String[] {fnames};} jpayne@68: // else{split=fnames.split(",");} jpayne@68: // jpayne@68: // long max=0; jpayne@68: // for(String s : split){ jpayne@68: // max=Tools.max(max, findMaxID(s)); jpayne@68: // } jpayne@68: // jpayne@68: // assert(max "+ncbi; jpayne@68: // if(x[gi]!=-1 && x[gi]!=ncbi){ jpayne@68: // if(!warned){ jpayne@68: // System.err.println("***WARNING*** For file "+fname+":\n"+ jpayne@68: // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ jpayne@68: // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" jpayne@68: // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" jpayne@68: // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); jpayne@68: // warned=true; jpayne@68: // } jpayne@68: // }else{ jpayne@68: // x[gi]=ncbi; jpayne@68: // } jpayne@68: // line=bf.nextLine(); jpayne@68: // } jpayne@68: // if(verbose){System.err.println("Count: "+count);} jpayne@68: // bf.close(); jpayne@68: // return count; jpayne@68: // } jpayne@68: jpayne@68: private static long maxGiLoaded=-1; jpayne@68: private static int[][] array; jpayne@68: private static final int SHIFT=30; jpayne@68: private static final long UPPERMASK=(-1L)<