jpayne@68: package sketch; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.Collections; jpayne@68: import java.util.LinkedHashSet; jpayne@68: import java.util.Set; jpayne@68: import java.util.concurrent.ConcurrentHashMap; jpayne@68: import java.util.concurrent.atomic.AtomicInteger; jpayne@68: import java.util.concurrent.atomic.AtomicLong; jpayne@68: jpayne@68: import shared.Parse; jpayne@68: import shared.Shared; jpayne@68: import shared.Tools; jpayne@68: import structures.AbstractBitSet; jpayne@68: import structures.ByteBuilder; jpayne@68: import structures.Heap; jpayne@68: import structures.IntHashMap; jpayne@68: import tax.TaxNode; jpayne@68: import tax.TaxTree; jpayne@68: jpayne@68: public class SketchSearcher extends SketchObject { jpayne@68: jpayne@68: public SketchSearcher(){ jpayne@68: jpayne@68: } jpayne@68: jpayne@68: public boolean parse(String arg, String a, String b, boolean addFileIfNotFound){ jpayne@68: jpayne@68: // System.err.println("Parsing "+arg+"; ref="+refFiles); //123 jpayne@68: jpayne@68: if(parseSketchFlags(arg, a, b)){ jpayne@68: //Do nothing jpayne@68: }else if(defaultParams.parse(arg, a, b)){ jpayne@68: //Do nothing jpayne@68: }else if(a.equals("verbose")){ jpayne@68: verbose=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("ref")){ jpayne@68: addRefFiles(b); jpayne@68: }else if(arg.equalsIgnoreCase("nt") || arg.equalsIgnoreCase("RefSeq") || arg.equalsIgnoreCase("refseqbig") || arg.equalsIgnoreCase("nr") jpayne@68: || arg.equalsIgnoreCase("img") || arg.equalsIgnoreCase("silva") || arg.equalsIgnoreCase("ribo") jpayne@68: || arg.equalsIgnoreCase("mito") || arg.equalsIgnoreCase("fungi") jpayne@68: || arg.equalsIgnoreCase("prokprot") || arg.equalsIgnoreCase("prokprotbig") || arg.equalsIgnoreCase("protein") || jpayne@68: arg.equalsIgnoreCase("protien") || a.equalsIgnoreCase("prot")){ jpayne@68: addRefFiles(arg); jpayne@68: }else if(a.equals("threads") || a.equals("sketchthreads") || a.equals("t")){ jpayne@68: threads=Integer.parseInt(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("minLevelExtended") || a.equalsIgnoreCase("minLevel")){ jpayne@68: minLevelExtended=TaxTree.parseLevelExtended(b); jpayne@68: }else if(a.equals("index") || a.equals("makeindex")){ jpayne@68: if(b!=null && "auto".equalsIgnoreCase(b)){ jpayne@68: autoIndex=true; jpayne@68: makeIndex=true; jpayne@68: }else{ jpayne@68: autoIndex=false; jpayne@68: makeIndex=Parse.parseBoolean(b); jpayne@68: } jpayne@68: }else if(a.equals("indexsize") || a.equals("indexlimit")){ jpayne@68: SketchIndex.indexLimit=Integer.parseInt(b); jpayne@68: } jpayne@68: jpayne@68: else if(b==null && arg.indexOf('=')<0 && addFileIfNotFound && (arg.indexOf(',')>=0 || new File(arg).exists())){ jpayne@68: addRefFiles(arg); jpayne@68: }else{ jpayne@68: return false; jpayne@68: } jpayne@68: // System.err.println("Parsed "+arg+"; ref="+refFiles); //123 jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: public boolean compare(ArrayList querySketches, ByteBuilder sb, DisplayParams params, int maxThreads){ jpayne@68: assert(params.postParsed); jpayne@68: final boolean json=params.json(); jpayne@68: ConcurrentHashMap map=new ConcurrentHashMap(); jpayne@68: jpayne@68: SketchResults[] alca=new SketchResults[querySketches.size()]; jpayne@68: jpayne@68: if(verbose2){System.err.println("At compare.");} jpayne@68: jpayne@68: boolean success=true; jpayne@68: final CompareBuffer buffer=new CompareBuffer(false); jpayne@68: AtomicInteger fakeID=new AtomicInteger(minFakeID); jpayne@68: for(int i=0; i1 && i==0){ jpayne@68: sb.append('['); jpayne@68: } jpayne@68: jpayne@68: sb.append(results.toText(params)); jpayne@68: jpayne@68: if(json && alca.length>1){ jpayne@68: if(i localRefSketches_, int pid_, int incr_, jpayne@68: AtomicInteger fakeID_, ConcurrentHashMap map_, DisplayParams params_){ jpayne@68: a=a_; jpayne@68: pid=pid_; jpayne@68: incr=incr_; jpayne@68: fakeID=fakeID_; jpayne@68: map=map_; jpayne@68: params=params_; jpayne@68: localRefSketches=localRefSketches_; jpayne@68: buffer=new CompareBuffer(params.needContamCounts()); jpayne@68: if(buffer.cbs!=null){buffer.cbs.setCapacity(a.length(), 0);} jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public void run(){ jpayne@68: if(a.length()<1 || a.length() map; jpayne@68: final CompareBuffer buffer; jpayne@68: final int incr; jpayne@68: final int pid; jpayne@68: final Sketch a; jpayne@68: final DisplayParams params; jpayne@68: final ArrayList localRefSketches; jpayne@68: long localComparisons=0; jpayne@68: jpayne@68: } jpayne@68: jpayne@68: public SketchResults processSketch(Sketch a, CompareBuffer buffer, AtomicInteger fakeID, jpayne@68: ConcurrentHashMap map, DisplayParams params, int maxThreads){ jpayne@68: if(a.length()<1 || a.length()1 && Shared.threads()>1 && sr.refSketchList.size()>31){ jpayne@68: if(verbose2){System.err.println("At processSketch 2.3");} //123 jpayne@68: assert((buffer.cbs==null)==(params.needContamCounts())); jpayne@68: spawnThreads(a, sr.refSketchList, fakeID, map, params, maxThreads); jpayne@68: if(verbose2){System.err.println("At processSketch 2.4");} //123 jpayne@68: }else{ jpayne@68: if(verbose2){System.err.println("At processSketch 2.5");} //123 jpayne@68: assert(buffer.cbs==null); jpayne@68: long comp=0; jpayne@68: for(Sketch b : sr.refSketchList){ jpayne@68: if(params.passesFilter(b)){ jpayne@68: comp++; jpayne@68: processPair(a, b, buffer, a.compareBitSet(), /*sr.taxHits,*/ fakeID, map, params); jpayne@68: } jpayne@68: } jpayne@68: comparisons.getAndAdd(comp); jpayne@68: if(verbose2){System.err.println("At processSketch 2.6");} //123 jpayne@68: } jpayne@68: if(verbose2){System.err.println("At processSketch 3");} //123 jpayne@68: jpayne@68: sr.addMap(map, params, buffer); jpayne@68: jpayne@68: fakeID.set(minFakeID); jpayne@68: map.clear(); jpayne@68: if(verbose2){System.err.println("At processSketch 4");} //123 jpayne@68: a.clearRefHitCounts(); jpayne@68: jpayne@68: return sr; jpayne@68: } jpayne@68: jpayne@68: //For remote homology jpayne@68: boolean passesTax(Sketch q, Sketch ref){ jpayne@68: assert(minLevelExtended>=0); jpayne@68: final int qid=q.taxID; jpayne@68: if(qid<0 || qid>=minFakeID){return false;} jpayne@68: TaxNode qtn=taxtree.getNode(qid); jpayne@68: if(qtn==null){return false;} jpayne@68: if(qtn.levelExtended>minLevelExtended){return false;} jpayne@68: final int rid=(ref==null ? -1 : ref.taxID); jpayne@68: if(rid>=0 && rid=minLevelExtended){ jpayne@68: return true; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: private void spawnThreads(Sketch a, ArrayList refs, AtomicInteger fakeID, jpayne@68: ConcurrentHashMap map, DisplayParams params, int maxThreads){ jpayne@68: final int toSpawn=Tools.max(1, Tools.min((refs.size()+7)/8, threads, maxThreads, Shared.threads())); jpayne@68: ArrayList alct=new ArrayList(toSpawn); jpayne@68: if(verbose2){System.err.println("At spawnThreads");} //123 jpayne@68: for(int t=0; t al, Sketch s, StringBuilder sb){ jpayne@68: // sb.append("\nResults for "+s.name()+":\n\n"); jpayne@68: // jpayne@68: // ArrayList tnl=new ArrayList(); jpayne@68: // for(Comparison c : al){ jpayne@68: // formatComparison(c, format, sb, printTax); jpayne@68: // } jpayne@68: // } jpayne@68: jpayne@68: boolean processPair(Sketch a, Sketch b, CompareBuffer buffer, AbstractBitSet abs, jpayne@68: AtomicInteger fakeID, ConcurrentHashMap map, DisplayParams params){ jpayne@68: // System.err.println("Comparing "+a.name()+" and "+b.name()); jpayne@68: assert(!params.printRefHits || a.refHitCounts()!=null || !SketchObject.makeIndex); jpayne@68: jpayne@68: jpayne@68: if(b.genomeSizeBases-1 && !passesTax(a, b)){return false;} jpayne@68: if(params.minSizeRatio>0){ jpayne@68: long sea=a.genomeSizeEstimate(); jpayne@68: long seb=b.genomeSizeEstimate(); jpayne@68: if(Tools.min(sea, seb)0){return false;} jpayne@68: jpayne@68: old=map.put(key, c); jpayne@68: while(old!=null && params.compare(old, c)>0){ jpayne@68: // System.err.println("B. Old: "+(old==null ? 0 : old.hits)+", new: "+c.hits); jpayne@68: c=old; jpayne@68: old=map.put(key, c); jpayne@68: } jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: // //TODO: Interestingly, the heap never seems to be created by anything... not sure what it's for. jpayne@68: // private static Comparison compareOneToOne(final Sketch a, final Sketch b, CompareBuffer buffer, AbstractBitSet abs, jpayne@68: // int minHits, float minWKID, float minANI, boolean aniFromWKID, Heap heap){ jpayne@68: //// assert(heap!=null); //Optional, for testing. jpayne@68: // if(a==b && !compareSelf){return null;} jpayne@68: // final int matches=a.countMatches(b, buffer, abs, true/*!makeIndex || !AUTOSIZE*/, null, -1); jpayne@68: // assert(matches==buffer.hits()); jpayne@68: // if(matches0){ jpayne@68: // final float ani=wkidToAni(xkid, a.k1Fraction()); jpayne@68: // if(animatches){return null;} //TODO: Should be based on score jpayne@68: // jpayne@68: //// System.err.print("*"); jpayne@68: // Comparison c=new Comparison(buffer, a, b); jpayne@68: // if(heap==null || heap.add(c)){return c;} jpayne@68: // return null; jpayne@68: // } jpayne@68: jpayne@68: //TODO: Interestingly, the heap never seems to be created by anything... not sure what it's for. jpayne@68: private static Comparison compareOneToOne(final Sketch a, final Sketch b, CompareBuffer buffer, AbstractBitSet abs, jpayne@68: int minHits, float minWKID, float minANI, boolean requireSSU, Heap heap){ jpayne@68: // assert(heap!=null); //Optional, for testing. jpayne@68: // assert(a.refHitCounts!=null); jpayne@68: if(a==b && !compareSelf){return null;} jpayne@68: if(requireSSU && !a.sharesSSU(b)){return null;} jpayne@68: final int matches=a.countMatches(b, buffer, abs, true/*!makeIndex || !AUTOSIZE*/, null, -1); jpayne@68: assert(matches==buffer.hits()); jpayne@68: if(matches0){ jpayne@68: final float ani=buffer.ani(); jpayne@68: if(animatches){return null;} //TODO: Should be based on score jpayne@68: jpayne@68: // System.err.print("*"); jpayne@68: Comparison c=new Comparison(buffer, a, b); jpayne@68: if(heap==null || heap.add(c)){return c;} jpayne@68: return null; jpayne@68: } jpayne@68: jpayne@68: public void addRefFiles(String a){ jpayne@68: if(a.equalsIgnoreCase("nr")){ jpayne@68: addRefFiles(NR_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.nrBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="nr";} jpayne@68: if(!setK){k=defaultKAmino; k2=defaultK2Amino;} jpayne@68: }else if(a.equalsIgnoreCase("nt")){ jpayne@68: addRefFiles(NT_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.ntBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="nt";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: }else if(a.equalsIgnoreCase("refseq")){ jpayne@68: addRefFiles(REFSEQ_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.refseqBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="RefSeq";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=2.0f;} jpayne@68: }else if(a.equalsIgnoreCase("refseqbig")){ jpayne@68: addRefFiles(REFSEQ_PATH_BIG()); jpayne@68: if(blacklist==null){blacklist=Blacklist.refseqBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="RefSeq";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=4.5f;} jpayne@68: }else if(a.equalsIgnoreCase("silva")){ jpayne@68: // TaxTree.SILVA_MODE=Parse.parseBoolean(b); jpayne@68: addRefFiles(SILVA_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.silvaBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="Silva";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: }else if(a.equalsIgnoreCase("img")){ jpayne@68: addRefFiles(IMG_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.imgBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="IMG";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: }else if(a.equalsIgnoreCase("prokprot") || a.equalsIgnoreCase("protein")){ jpayne@68: addRefFiles(PROKPROT_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.prokProtBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="ProkProt";} jpayne@68: if(!setK){k=defaultKAmino; k2=defaultK2Amino;} jpayne@68: if(!amino && !translate) { jpayne@68: translate=true; jpayne@68: System.err.println("Setting translate to true because a protein dataset is being used."); jpayne@68: } jpayne@68: if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=3.0f;} jpayne@68: }else if(a.equalsIgnoreCase("prokprotbig") || a.equalsIgnoreCase("proteinbig")){ jpayne@68: addRefFiles(PROKPROT_PATH_BIG()); jpayne@68: if(blacklist==null){blacklist=Blacklist.prokProtBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="ProkProt";} jpayne@68: if(!setK){k=defaultKAmino; k2=defaultK2Amino;} jpayne@68: if(!amino && !translate) { jpayne@68: translate=true; jpayne@68: System.err.println("Setting translate to true because a protein dataset is being used."); jpayne@68: } jpayne@68: if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=7.5f;} jpayne@68: }else if(a.equalsIgnoreCase("mito") || a.equalsIgnoreCase("refseqmito")){ jpayne@68: addRefFiles(MITO_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.mitoBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="RefSeqMito";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: }else if(a.equalsIgnoreCase("fungi") || a.equalsIgnoreCase("refseqfungi")){ jpayne@68: addRefFiles(FUNGI_PATH()); jpayne@68: if(blacklist==null){blacklist=Blacklist.fungiBlacklist();} jpayne@68: if(defaultParams.dbName==null){defaultParams.dbName="RefSeqFungi";} jpayne@68: if(!setK){k=defaultK; k2=defaultK2;} jpayne@68: }else{ jpayne@68: addFiles(a, refFiles); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: static void addFiles(String a, Set list){ jpayne@68: if(a==null){return;} jpayne@68: File f=new File(a); jpayne@68: assert(!list.contains(a)) : "Duplicate file "+a; jpayne@68: jpayne@68: if(f.exists()){ jpayne@68: list.add(a); jpayne@68: }else if(a.indexOf(',')>0){ jpayne@68: for(String s : a.split(",")){addFiles(s, list);} jpayne@68: }else if(a.indexOf('#')>=0 && new File(a.replaceFirst("#", "0")).exists()){ jpayne@68: for(int i=0; true; i++){ jpayne@68: String temp=a.replaceFirst("#", ""+i); jpayne@68: if(!new File(temp).exists()){break;} jpayne@68: list.add(temp); jpayne@68: } jpayne@68: }else{ jpayne@68: list.add(a); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public void makeIndex(){ jpayne@68: assert(index==null); jpayne@68: index=new SketchIndex(refSketches); jpayne@68: index.load(); jpayne@68: } jpayne@68: jpayne@68: public void loadReferences(int mode_, DisplayParams params){ jpayne@68: loadReferences(mode_, params.minKeyOccuranceCount, params.minEntropy, params.minProb, params.minQual); jpayne@68: } jpayne@68: jpayne@68: public void loadReferences(int mode_, int minKeyOccuranceCount, float minEntropy, float minProb, byte minQual) { jpayne@68: makeTool(minKeyOccuranceCount, false, false); jpayne@68: refSketches=tool.loadSketches_MT(mode_, 1f, -1, minEntropy, minProb, minQual, refFiles); jpayne@68: assert(refSketches!=null) : refFiles; jpayne@68: if(mode_==PER_FILE){ jpayne@68: Collections.sort(refSketches, SketchIdComparator.comparator); jpayne@68: } jpayne@68: taxIDToSketchIDMap=new IntHashMap(Tools.max(3, (int)(refSketches.size()*1.2f))); jpayne@68: for(int i=0; i0){ jpayne@68: taxIDToSketchIDMap.set(sk.taxID, i); jpayne@68: } jpayne@68: } jpayne@68: // System.err.println("Sketches: "+refSketches.get(0).name()); jpayne@68: if(makeIndex){ jpayne@68: makeIndex(); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public void makeTool(int minKeyOccuranceCount, boolean trackCounts, boolean mergePairs){ jpayne@68: if(tool==null){ jpayne@68: tool=new SketchTool(targetSketchSize, minKeyOccuranceCount, trackCounts, mergePairs); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public ArrayList loadSketchesFromString(String sketchString){ jpayne@68: return tool.loadSketchesFromString(sketchString); jpayne@68: } jpayne@68: jpayne@68: public int refFileCount(){return refFiles==null ? 0 : refFiles.size();} jpayne@68: public int refSketchCount(){return refSketches==null ? 0 : refSketches.size();} jpayne@68: jpayne@68: public Sketch findReferenceSketch(int taxID){ jpayne@68: if(taxID<1){return null;} jpayne@68: int skid=taxIDToSketchIDMap.get(taxID); jpayne@68: return skid<0 ? null : refSketches.get(skid); jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public SketchIndex index=null; jpayne@68: public boolean autoIndex=true; jpayne@68: jpayne@68: public SketchTool tool=null; jpayne@68: public ArrayList refSketches; jpayne@68: LinkedHashSet refFiles=new LinkedHashSet(); jpayne@68: /** For ref sketch lookups by TaxID */ jpayne@68: private IntHashMap taxIDToSketchIDMap; jpayne@68: public int threads=Shared.threads(); jpayne@68: boolean verbose; jpayne@68: boolean errorState=false; jpayne@68: AtomicLong comparisons=new AtomicLong(0); jpayne@68: jpayne@68: int minLevelExtended=-1; jpayne@68: jpayne@68: }