comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchSearcher.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package sketch;
2
3 import java.io.File;
4 import java.util.ArrayList;
5 import java.util.Collections;
6 import java.util.LinkedHashSet;
7 import java.util.Set;
8 import java.util.concurrent.ConcurrentHashMap;
9 import java.util.concurrent.atomic.AtomicInteger;
10 import java.util.concurrent.atomic.AtomicLong;
11
12 import shared.Parse;
13 import shared.Shared;
14 import shared.Tools;
15 import structures.AbstractBitSet;
16 import structures.ByteBuilder;
17 import structures.Heap;
18 import structures.IntHashMap;
19 import tax.TaxNode;
20 import tax.TaxTree;
21
22 public class SketchSearcher extends SketchObject {
23
24 public SketchSearcher(){
25
26 }
27
28 public boolean parse(String arg, String a, String b, boolean addFileIfNotFound){
29
30 // System.err.println("Parsing "+arg+"; ref="+refFiles); //123
31
32 if(parseSketchFlags(arg, a, b)){
33 //Do nothing
34 }else if(defaultParams.parse(arg, a, b)){
35 //Do nothing
36 }else if(a.equals("verbose")){
37 verbose=Parse.parseBoolean(b);
38 }else if(a.equals("ref")){
39 addRefFiles(b);
40 }else if(arg.equalsIgnoreCase("nt") || arg.equalsIgnoreCase("RefSeq") || arg.equalsIgnoreCase("refseqbig") || arg.equalsIgnoreCase("nr")
41 || arg.equalsIgnoreCase("img") || arg.equalsIgnoreCase("silva") || arg.equalsIgnoreCase("ribo")
42 || arg.equalsIgnoreCase("mito") || arg.equalsIgnoreCase("fungi")
43 || arg.equalsIgnoreCase("prokprot") || arg.equalsIgnoreCase("prokprotbig") || arg.equalsIgnoreCase("protein") ||
44 arg.equalsIgnoreCase("protien") || a.equalsIgnoreCase("prot")){
45 addRefFiles(arg);
46 }else if(a.equals("threads") || a.equals("sketchthreads") || a.equals("t")){
47 threads=Integer.parseInt(b);
48 }
49
50 else if(a.equalsIgnoreCase("minLevelExtended") || a.equalsIgnoreCase("minLevel")){
51 minLevelExtended=TaxTree.parseLevelExtended(b);
52 }else if(a.equals("index") || a.equals("makeindex")){
53 if(b!=null && "auto".equalsIgnoreCase(b)){
54 autoIndex=true;
55 makeIndex=true;
56 }else{
57 autoIndex=false;
58 makeIndex=Parse.parseBoolean(b);
59 }
60 }else if(a.equals("indexsize") || a.equals("indexlimit")){
61 SketchIndex.indexLimit=Integer.parseInt(b);
62 }
63
64 else if(b==null && arg.indexOf('=')<0 && addFileIfNotFound && (arg.indexOf(',')>=0 || new File(arg).exists())){
65 addRefFiles(arg);
66 }else{
67 return false;
68 }
69 // System.err.println("Parsed "+arg+"; ref="+refFiles); //123
70 return true;
71 }
72
73 public boolean compare(ArrayList<Sketch> querySketches, ByteBuilder sb, DisplayParams params, int maxThreads){
74 assert(params.postParsed);
75 final boolean json=params.json();
76 ConcurrentHashMap<Integer, Comparison> map=new ConcurrentHashMap<Integer, Comparison>();
77
78 SketchResults[] alca=new SketchResults[querySketches.size()];
79
80 if(verbose2){System.err.println("At compare.");}
81
82 boolean success=true;
83 final CompareBuffer buffer=new CompareBuffer(false);
84 AtomicInteger fakeID=new AtomicInteger(minFakeID);
85 for(int i=0; i<querySketches.size(); i++){
86 fakeID.set(minFakeID);
87 Sketch a=querySketches.get(i);
88
89 SketchResults results=processSketch(a, buffer, fakeID, map, params, maxThreads);
90 a.clearRefHitCounts();
91 alca[i]=results;
92 // System.out.println(a.present);
93 }
94
95 if(verbose2){System.err.println("Made results.");}
96
97 for(int i=0; i<alca.length; i++){
98 // Sketch s=sketches.get(i);
99 SketchResults results=alca[i];
100
101 if(json && alca.length>1 && i==0){
102 sb.append('[');
103 }
104
105 sb.append(results.toText(params));
106
107 if(json && alca.length>1){
108 if(i<alca.length-1){
109 sb.append(',');
110 }else{
111 sb.append(']');
112 }
113 }
114 }
115 return success;
116 }
117
118 private class CompareThread extends Thread {
119
120 CompareThread(Sketch a_, ArrayList<Sketch> localRefSketches_, int pid_, int incr_,
121 AtomicInteger fakeID_, ConcurrentHashMap<Integer, Comparison> map_, DisplayParams params_){
122 a=a_;
123 pid=pid_;
124 incr=incr_;
125 fakeID=fakeID_;
126 map=map_;
127 params=params_;
128 localRefSketches=localRefSketches_;
129 buffer=new CompareBuffer(params.needContamCounts());
130 if(buffer.cbs!=null){buffer.cbs.setCapacity(a.length(), 0);}
131 }
132
133 @Override
134 public void run(){
135 if(a.length()<1 || a.length()<params.minHits || (params.requireSSU && !a.hasSSU())){return;}//TODO: Change to 'require16S'
136 assert(a.compareBitSet()==null || buffer.cbs!=null) : (a.compareBitSet()==null)+", "+(buffer.cbs==null); //Unsafe to use a.cbs multithreaded unless atomic
137 final AbstractBitSet cbs=(buffer.cbs==null ? a.compareBitSet() : buffer.cbs);
138 for(int i=pid; i<localRefSketches.size(); i+=incr){
139 Sketch b=localRefSketches.get(i);
140 if(params.passesFilter(b)){
141 processPair(a, b, buffer, cbs, fakeID, map, params);
142 localComparisons++;
143 }
144 }
145 comparisons.getAndAdd(localComparisons);
146 }
147
148 final AtomicInteger fakeID;
149 final ConcurrentHashMap<Integer, Comparison> map;
150 final CompareBuffer buffer;
151 final int incr;
152 final int pid;
153 final Sketch a;
154 final DisplayParams params;
155 final ArrayList<Sketch> localRefSketches;
156 long localComparisons=0;
157
158 }
159
160 public SketchResults processSketch(Sketch a, CompareBuffer buffer, AtomicInteger fakeID,
161 ConcurrentHashMap<Integer, Comparison> map, DisplayParams params, int maxThreads){
162 if(a.length()<1 || a.length()<params.minHits || (params.requireSSU && !a.hasSSU())){return new SketchResults(a);}
163 // Timer t=new Timer();
164 // t.start("Began query.");
165 assert(a.compareBitSet()==null);
166 assert(a.indexBitSet()==null);
167
168 if(verbose2){System.err.println("At processSketch 1");} //123
169
170 a.makeBitSets(params.needContamCounts(), index!=null);
171
172 final SketchResults sr;
173 if(index!=null){
174 sr=index.getSketches(a, params);
175 }else{
176 sr=new SketchResults(a, refSketches, null);
177 }
178
179 if(verbose2){System.err.println("At processSketch 2");} //123
180
181 if(sr==null || sr.refSketchList==null || sr.refSketchList.isEmpty()){
182 if(verbose2){System.err.println("At processSketch 2.0");} //123
183 return sr;
184 }
185
186 if(verbose2){System.err.println("At processSketch 2.1");} //123
187
188 if(verbose2){System.err.println("At processSketch 2.2");} //123
189
190 if(maxThreads>1 && Shared.threads()>1 && sr.refSketchList.size()>31){
191 if(verbose2){System.err.println("At processSketch 2.3");} //123
192 assert((buffer.cbs==null)==(params.needContamCounts()));
193 spawnThreads(a, sr.refSketchList, fakeID, map, params, maxThreads);
194 if(verbose2){System.err.println("At processSketch 2.4");} //123
195 }else{
196 if(verbose2){System.err.println("At processSketch 2.5");} //123
197 assert(buffer.cbs==null);
198 long comp=0;
199 for(Sketch b : sr.refSketchList){
200 if(params.passesFilter(b)){
201 comp++;
202 processPair(a, b, buffer, a.compareBitSet(), /*sr.taxHits,*/ fakeID, map, params);
203 }
204 }
205 comparisons.getAndAdd(comp);
206 if(verbose2){System.err.println("At processSketch 2.6");} //123
207 }
208 if(verbose2){System.err.println("At processSketch 3");} //123
209
210 sr.addMap(map, params, buffer);
211
212 fakeID.set(minFakeID);
213 map.clear();
214 if(verbose2){System.err.println("At processSketch 4");} //123
215 a.clearRefHitCounts();
216
217 return sr;
218 }
219
220 //For remote homology
221 boolean passesTax(Sketch q, Sketch ref){
222 assert(minLevelExtended>=0);
223 final int qid=q.taxID;
224 if(qid<0 || qid>=minFakeID){return false;}
225 TaxNode qtn=taxtree.getNode(qid);
226 if(qtn==null){return false;}
227 if(qtn.levelExtended>minLevelExtended){return false;}
228 final int rid=(ref==null ? -1 : ref.taxID);
229 if(rid>=0 && rid<minFakeID){
230 TaxNode rtn=taxtree.getNode(rid);
231 if(rtn!=null && rtn.levelExtended<=minLevelExtended){
232 TaxNode ancestor=taxtree.commonAncestor(qtn, rtn);
233 if(ancestor!=null && ancestor.levelExtended>=minLevelExtended){
234 return true;
235 }
236 }
237 }
238 return false;
239 }
240
241 private void spawnThreads(Sketch a, ArrayList<Sketch> refs, AtomicInteger fakeID,
242 ConcurrentHashMap<Integer, Comparison> map, DisplayParams params, int maxThreads){
243 final int toSpawn=Tools.max(1, Tools.min((refs.size()+7)/8, threads, maxThreads, Shared.threads()));
244 ArrayList<CompareThread> alct=new ArrayList<CompareThread>(toSpawn);
245 if(verbose2){System.err.println("At spawnThreads");} //123
246 for(int t=0; t<toSpawn; t++){
247 alct.add(new CompareThread(a, refs, t, toSpawn, fakeID, map, params));
248 }
249 for(CompareThread ct : alct){ct.start();}
250 for(CompareThread ct : alct){
251
252 //Wait until this thread has terminated
253 while(ct.getState()!=Thread.State.TERMINATED){
254 try {
255 //Attempt a join operation
256 ct.join();
257 } catch (InterruptedException e) {
258 e.printStackTrace();
259 }
260 }
261 }
262 if(params.needContamCounts()){
263 for(CompareThread ct : alct){
264 if(ct.buffer.cbs==null){
265 assert((AUTOSIZE || AUTOSIZE_LINEAR) && index!=null);//Not really what this does
266 break;
267 }
268 a.addToBitSet(ct.buffer.cbs);
269 }
270 }
271 a.clearRefHitCounts();
272 alct=null;
273 }
274
275 // private void writeResults(ArrayList<Comparison> al, Sketch s, StringBuilder sb){
276 // sb.append("\nResults for "+s.name()+":\n\n");
277 //
278 // ArrayList<TaxNode> tnl=new ArrayList<TaxNode>();
279 // for(Comparison c : al){
280 // formatComparison(c, format, sb, printTax);
281 // }
282 // }
283
284 boolean processPair(Sketch a, Sketch b, CompareBuffer buffer, AbstractBitSet abs,
285 AtomicInteger fakeID, ConcurrentHashMap<Integer, Comparison> map, DisplayParams params){
286 // System.err.println("Comparing "+a.name()+" and "+b.name());
287 assert(!params.printRefHits || a.refHitCounts()!=null || !SketchObject.makeIndex);
288
289
290 if(b.genomeSizeBases<params.minBases){return false;}
291 if(minLevelExtended>-1 && !passesTax(a, b)){return false;}
292 if(params.minSizeRatio>0){
293 long sea=a.genomeSizeEstimate();
294 long seb=b.genomeSizeEstimate();
295 if(Tools.min(sea, seb)<params.minSizeRatio*Tools.max(sea, seb)){return false;}
296 }
297 Comparison c=compareOneToOne(a, b, buffer, abs, /*taxHits, params.contamLevel(),*/ params.minHits, params.minWKID, params.minANI, params.requireSSU, null);
298 if(c==null){return false;}
299 if(c.taxID()<1){c.taxID=fakeID.getAndIncrement();}
300
301 // System.err.println("TID: "+c.taxID()+", "+fakeID);
302
303 TaxNode tn=(taxtree==null ? null : taxtree.getNode(b.taxID));
304 if(tn!=null){
305 c.taxName=tn.name;
306 if(tn.level<params.taxLevel){
307 TaxNode tn2=taxtree.getNodeAtLevel(b.taxID, params.taxLevel);
308 tn=tn2;
309 }
310 }
311 Integer key=(tn==null ? c.taxID : tn.id);
312
313 Comparison old=map.get(key);
314 // System.err.println("A. Old: "+(old==null ? 0 : old.hits)+", new: "+c.hits);
315 if(old!=null && params.compare(old, c)>0){return false;}
316
317 old=map.put(key, c);
318 while(old!=null && params.compare(old, c)>0){
319 // System.err.println("B. Old: "+(old==null ? 0 : old.hits)+", new: "+c.hits);
320 c=old;
321 old=map.put(key, c);
322 }
323 return true;
324 }
325
326 // //TODO: Interestingly, the heap never seems to be created by anything... not sure what it's for.
327 // private static Comparison compareOneToOne(final Sketch a, final Sketch b, CompareBuffer buffer, AbstractBitSet abs,
328 // int minHits, float minWKID, float minANI, boolean aniFromWKID, Heap<Comparison> heap){
329 //// assert(heap!=null); //Optional, for testing.
330 // if(a==b && !compareSelf){return null;}
331 // final int matches=a.countMatches(b, buffer, abs, true/*!makeIndex || !AUTOSIZE*/, null, -1);
332 // assert(matches==buffer.hits());
333 // if(matches<minHits){return null;}
334 //// asdf //TODO: handle k1 and k2 WKIDs here.
335 // {
336 //// final int div=aniFromWKID ? buffer.minDivisor() : buffer.maxDivisor();
337 //// final float xkid=matches/(float)div;//This could be kid or wkid at this point...
338 //// if(xkid<minWKID){return null;}
339 //
340 // final int div=aniFromWKID ? buffer.minDivisor() : buffer.maxDivisor();
341 // final float xkid=matches/(float)div;//This could be kid or wkid at this point...
342 // if(xkid<minWKID){return null;}
343 //
344 // //TODO (?) This is only necessary because of the order of setting minwkid and minani.
345 // //minWKID can be deterministically determined from minANI so if it is set correctly this can be skipped.
346 // if(minANI>0){
347 // final float ani=wkidToAni(xkid, a.k1Fraction());
348 // if(ani<minANI){return null;}
349 // }
350 // }
351 //
352 // if(heap!=null && !heap.hasRoom() && heap.peek().hits()>matches){return null;} //TODO: Should be based on score
353 //
354 //// System.err.print("*");
355 // Comparison c=new Comparison(buffer, a, b);
356 // if(heap==null || heap.add(c)){return c;}
357 // return null;
358 // }
359
360 //TODO: Interestingly, the heap never seems to be created by anything... not sure what it's for.
361 private static Comparison compareOneToOne(final Sketch a, final Sketch b, CompareBuffer buffer, AbstractBitSet abs,
362 int minHits, float minWKID, float minANI, boolean requireSSU, Heap<Comparison> heap){
363 // assert(heap!=null); //Optional, for testing.
364 // assert(a.refHitCounts!=null);
365 if(a==b && !compareSelf){return null;}
366 if(requireSSU && !a.sharesSSU(b)){return null;}
367 final int matches=a.countMatches(b, buffer, abs, true/*!makeIndex || !AUTOSIZE*/, null, -1);
368 assert(matches==buffer.hits());
369 if(matches<minHits){return null;}
370
371 {
372 final float wkid=buffer.wkid();
373 if(wkid<minWKID){return null;}
374
375 if(minANI>0){
376 final float ani=buffer.ani();
377 if(ani<minANI){return null;}
378 }
379 }
380
381 if(heap!=null && !heap.hasRoom() && heap.peek().hits()>matches){return null;} //TODO: Should be based on score
382
383 // System.err.print("*");
384 Comparison c=new Comparison(buffer, a, b);
385 if(heap==null || heap.add(c)){return c;}
386 return null;
387 }
388
389 public void addRefFiles(String a){
390 if(a.equalsIgnoreCase("nr")){
391 addRefFiles(NR_PATH());
392 if(blacklist==null){blacklist=Blacklist.nrBlacklist();}
393 if(defaultParams.dbName==null){defaultParams.dbName="nr";}
394 if(!setK){k=defaultKAmino; k2=defaultK2Amino;}
395 }else if(a.equalsIgnoreCase("nt")){
396 addRefFiles(NT_PATH());
397 if(blacklist==null){blacklist=Blacklist.ntBlacklist();}
398 if(defaultParams.dbName==null){defaultParams.dbName="nt";}
399 if(!setK){k=defaultK; k2=defaultK2;}
400 }else if(a.equalsIgnoreCase("refseq")){
401 addRefFiles(REFSEQ_PATH());
402 if(blacklist==null){blacklist=Blacklist.refseqBlacklist();}
403 if(defaultParams.dbName==null){defaultParams.dbName="RefSeq";}
404 if(!setK){k=defaultK; k2=defaultK2;}
405 if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=2.0f;}
406 }else if(a.equalsIgnoreCase("refseqbig")){
407 addRefFiles(REFSEQ_PATH_BIG());
408 if(blacklist==null){blacklist=Blacklist.refseqBlacklist();}
409 if(defaultParams.dbName==null){defaultParams.dbName="RefSeq";}
410 if(!setK){k=defaultK; k2=defaultK2;}
411 if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=4.5f;}
412 }else if(a.equalsIgnoreCase("silva")){
413 // TaxTree.SILVA_MODE=Parse.parseBoolean(b);
414 addRefFiles(SILVA_PATH());
415 if(blacklist==null){blacklist=Blacklist.silvaBlacklist();}
416 if(defaultParams.dbName==null){defaultParams.dbName="Silva";}
417 if(!setK){k=defaultK; k2=defaultK2;}
418 }else if(a.equalsIgnoreCase("img")){
419 addRefFiles(IMG_PATH());
420 if(blacklist==null){blacklist=Blacklist.imgBlacklist();}
421 if(defaultParams.dbName==null){defaultParams.dbName="IMG";}
422 if(!setK){k=defaultK; k2=defaultK2;}
423 }else if(a.equalsIgnoreCase("prokprot") || a.equalsIgnoreCase("protein")){
424 addRefFiles(PROKPROT_PATH());
425 if(blacklist==null){blacklist=Blacklist.prokProtBlacklist();}
426 if(defaultParams.dbName==null){defaultParams.dbName="ProkProt";}
427 if(!setK){k=defaultKAmino; k2=defaultK2Amino;}
428 if(!amino && !translate) {
429 translate=true;
430 System.err.println("Setting translate to true because a protein dataset is being used.");
431 }
432 if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=3.0f;}
433 }else if(a.equalsIgnoreCase("prokprotbig") || a.equalsIgnoreCase("proteinbig")){
434 addRefFiles(PROKPROT_PATH_BIG());
435 if(blacklist==null){blacklist=Blacklist.prokProtBlacklist();}
436 if(defaultParams.dbName==null){defaultParams.dbName="ProkProt";}
437 if(!setK){k=defaultKAmino; k2=defaultK2Amino;}
438 if(!amino && !translate) {
439 translate=true;
440 System.err.println("Setting translate to true because a protein dataset is being used.");
441 }
442 if(!SET_AUTOSIZE_FACTOR){AUTOSIZE_FACTOR=7.5f;}
443 }else if(a.equalsIgnoreCase("mito") || a.equalsIgnoreCase("refseqmito")){
444 addRefFiles(MITO_PATH());
445 if(blacklist==null){blacklist=Blacklist.mitoBlacklist();}
446 if(defaultParams.dbName==null){defaultParams.dbName="RefSeqMito";}
447 if(!setK){k=defaultK; k2=defaultK2;}
448 }else if(a.equalsIgnoreCase("fungi") || a.equalsIgnoreCase("refseqfungi")){
449 addRefFiles(FUNGI_PATH());
450 if(blacklist==null){blacklist=Blacklist.fungiBlacklist();}
451 if(defaultParams.dbName==null){defaultParams.dbName="RefSeqFungi";}
452 if(!setK){k=defaultK; k2=defaultK2;}
453 }else{
454 addFiles(a, refFiles);
455 }
456 }
457
458 static void addFiles(String a, Set<String> list){
459 if(a==null){return;}
460 File f=new File(a);
461 assert(!list.contains(a)) : "Duplicate file "+a;
462
463 if(f.exists()){
464 list.add(a);
465 }else if(a.indexOf(',')>0){
466 for(String s : a.split(",")){addFiles(s, list);}
467 }else if(a.indexOf('#')>=0 && new File(a.replaceFirst("#", "0")).exists()){
468 for(int i=0; true; i++){
469 String temp=a.replaceFirst("#", ""+i);
470 if(!new File(temp).exists()){break;}
471 list.add(temp);
472 }
473 }else{
474 list.add(a);
475 }
476 }
477
478 public void makeIndex(){
479 assert(index==null);
480 index=new SketchIndex(refSketches);
481 index.load();
482 }
483
484 public void loadReferences(int mode_, DisplayParams params){
485 loadReferences(mode_, params.minKeyOccuranceCount, params.minEntropy, params.minProb, params.minQual);
486 }
487
488 public void loadReferences(int mode_, int minKeyOccuranceCount, float minEntropy, float minProb, byte minQual) {
489 makeTool(minKeyOccuranceCount, false, false);
490 refSketches=tool.loadSketches_MT(mode_, 1f, -1, minEntropy, minProb, minQual, refFiles);
491 assert(refSketches!=null) : refFiles;
492 if(mode_==PER_FILE){
493 Collections.sort(refSketches, SketchIdComparator.comparator);
494 }
495 taxIDToSketchIDMap=new IntHashMap(Tools.max(3, (int)(refSketches.size()*1.2f)));
496 for(int i=0; i<refSketches.size(); i++){
497 Sketch sk=refSketches.get(i);
498 if(sk!=null && sk.taxID>0){
499 taxIDToSketchIDMap.set(sk.taxID, i);
500 }
501 }
502 // System.err.println("Sketches: "+refSketches.get(0).name());
503 if(makeIndex){
504 makeIndex();
505 }
506 }
507
508 public void makeTool(int minKeyOccuranceCount, boolean trackCounts, boolean mergePairs){
509 if(tool==null){
510 tool=new SketchTool(targetSketchSize, minKeyOccuranceCount, trackCounts, mergePairs);
511 }
512 }
513
514 public ArrayList<Sketch> loadSketchesFromString(String sketchString){
515 return tool.loadSketchesFromString(sketchString);
516 }
517
518 public int refFileCount(){return refFiles==null ? 0 : refFiles.size();}
519 public int refSketchCount(){return refSketches==null ? 0 : refSketches.size();}
520
521 public Sketch findReferenceSketch(int taxID){
522 if(taxID<1){return null;}
523 int skid=taxIDToSketchIDMap.get(taxID);
524 return skid<0 ? null : refSketches.get(skid);
525 }
526
527 /*--------------------------------------------------------------*/
528
529 public SketchIndex index=null;
530 public boolean autoIndex=true;
531
532 public SketchTool tool=null;
533 public ArrayList<Sketch> refSketches;
534 LinkedHashSet<String> refFiles=new LinkedHashSet<String>();
535 /** For ref sketch lookups by TaxID */
536 private IntHashMap taxIDToSketchIDMap;
537 public int threads=Shared.threads();
538 boolean verbose;
539 boolean errorState=false;
540 AtomicLong comparisons=new AtomicLong(0);
541
542 int minLevelExtended=-1;
543
544 }