comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchMakerMini.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package sketch;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7
8 import dna.AminoAcid;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import jgi.BBMerge;
12 import jgi.TranslateSixFrames;
13 import prok.CallGenes;
14 import prok.GeneCaller;
15 import prok.Orf;
16 import prok.ProkObject;
17 import shared.Parse;
18 import shared.Tools;
19 import stream.ConcurrentReadInputStream;
20 import stream.Read;
21 import structures.EntropyTracker;
22 import structures.ListNum;
23 import tax.TaxNode;
24 import tax.TaxTree;
25
26 /**
27 * Creates MinHashSketches rapidly.
28 *
29 * @author Brian Bushnell
30 * @date July 6, 2016
31 *
32 */
33 public class SketchMakerMini extends SketchObject {
34
35 /*--------------------------------------------------------------*/
36 /*---------------- Initialization ----------------*/
37 /*--------------------------------------------------------------*/
38
39 public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){
40 this(tool_, mode_, params.minEntropy, params.minProb, params.minQual);
41 }
42
43 /**
44 * Constructor.
45 */
46 public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){
47
48 tool=tool_;
49 mode=mode_;
50 minProb=minProb_;
51 minQual=minQual_;
52
53 aminoShift=AminoAcid.AMINO_SHIFT;
54 if(!aminoOrTranslate()){
55 shift=2*k;
56 shift2=shift-2;
57 mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
58 }else{
59 shift=aminoShift*k;
60 shift2=shift-aminoShift;
61 mask=(shift>63 ? -1L : ~((-1L)<<shift));
62 }
63
64 if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){
65 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts);
66 }else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){
67 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))),
68 tool.minKeyOccuranceCount, tool.trackCounts);
69 }else{
70 heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts);
71 }
72
73 if(minEntropy_>0){
74 eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true);
75 }else{
76 eTracker=null;
77 }
78
79 if(translate || processSSU){
80 gCaller=CallGenes.makeGeneCaller(pgm);
81 }else{
82 gCaller=null;
83 }
84 }
85
86 /*--------------------------------------------------------------*/
87 /*---------------- Outer Methods ----------------*/
88 /*--------------------------------------------------------------*/
89
90 /** Create read streams and process all data */
91 public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){
92 heap.clear(false); //123
93 final String simpleName;
94
95 final FileFormat ffin1, ffin2;
96 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
97 ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true);
98 ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true);
99 }else{
100 ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true);
101 ffin2=null;
102 }
103
104 //Create a read input stream
105 final ConcurrentReadInputStream cris;
106 {
107 simpleName=ffin1.simpleName();
108 heap.setFname(simpleName);
109 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
110 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
111 cris.start(); //Start the stream
112 if(verbose){outstream.println("Started cris");}
113 }
114 if(mode==ONE_SKETCH || mode==PER_FILE){
115 if(heap.name0()==null){heap.setName0(simpleName);}
116 }
117 ArrayList<Sketch> sketches=processInner(cris);
118
119 errorState|=ReadWrite.closeStream(cris);
120 sketchesMade++;
121 return sketches;
122 }
123
124 /*--------------------------------------------------------------*/
125 /*---------------- Inner Methods ----------------*/
126 /*--------------------------------------------------------------*/
127
128 /** Iterate through the reads */
129 ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){
130 assert(heap.size()==0);
131 ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8);
132
133 //Grab the first ListNum of reads
134 ListNum<Read> ln=cris.nextList();
135 //Grab the actual read list from the ListNum
136 ArrayList<Read> reads=(ln!=null ? ln.list : null);
137
138 //As long as there is a nonempty read list...
139 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
140 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
141
142 //Loop through each read in the list
143 for(int idx=0; idx<reads.size(); idx++){
144 final Read r1=reads.get(idx);
145 final Read r2=r1.mate;
146
147 processReadPair(r1, r2);
148 if(mode!=ONE_SKETCH && mode!=PER_FILE){
149 if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){
150 int size=heap.size();
151 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
152 assert(sketch.keys.length>0) : sketch.keys.length+", "+size;
153 sketch.loadSSU();
154 sketches.add(sketch);
155 sketchesMade++;
156 }
157 if(heap!=null){heap.clear(false);}
158 }
159 }
160
161 //Notify the input stream that the list was used
162 cris.returnList(ln);
163 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
164
165 //Fetch a new list
166 ln=cris.nextList();
167 reads=(ln!=null ? ln.list : null);
168 }
169
170 //Notify the input stream that the final list was used
171 if(ln!=null){
172 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
173 }
174
175 if(mode==ONE_SKETCH || mode==PER_FILE){
176 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
177 sketch.loadSSU();
178 sketches.add(sketch);
179 sketchesMade++;
180 }
181 heap.clear(true);
182 return sketches;
183 }
184
185 void processReadPair(Read r1, Read r2){
186 //Track the initial length for statistics
187 final int initialLength1=r1.length();
188 final int initialLength2=r1.mateLength();
189
190 //Increment counters
191 readsProcessed+=r1.pairCount();
192 basesProcessed+=initialLength1+initialLength2;
193
194 if(mode!=ONE_SKETCH && mode!=PER_FILE){
195 int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize);
196 if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);}
197 }
198
199 if(tool.mergePairs && r2!=null){
200 final int insert=BBMerge.findOverlapStrict(r1, r2, false);
201 if(insert>0){
202 heap.genomeSequences++;
203 r2.reverseComplement();
204 r1=r1.joinRead(insert);
205 r2=null;
206 }
207 }
208
209 processRead(r1);
210 if(r2!=null){processRead(r2);}
211
212 if(heap.name0()==null){
213 heap.setName0(r1.id);
214 }
215
216 TaxNode tn=null;
217 if(heap.taxID<0 && r1.length()>800){
218 if(taxtree!=null){
219 try {
220 tn=taxtree.parseNodeFromHeader(r1.id, true);
221 } catch (Throwable e) {}
222 if(tn!=null){
223 heap.taxID=tn.id;
224 if(heap.taxName()==null){
225 heap.setTaxName(tn.name);
226 }
227 }
228 // System.err.println("A) "+heap.taxID+r1.id);
229 }else{
230 heap.taxID=TaxTree.parseHeaderStatic(r1.id);
231 // System.err.println("B) "+heap.taxID+r1.id);
232 }
233 }
234 assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn;
235 }
236
237 public void processRead(final Read r){
238 if(amino){
239 processReadAmino(r);
240 }else if(translate){
241 processReadTranslated(r);
242 }else{
243 processReadNucleotide(r);
244 }
245 }
246
247 public void processReadTranslated(final Read r){
248 assert(!r.aminoacid());
249 final ArrayList<Read> prots;
250 if(sixframes){
251 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
252 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also
253 if(orf!=null && orf.length()>=min_SSU_len){
254 assert(orf.is16S());
255 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
256 }
257 //TODO: Add 18S.
258 }
259 prots=TranslateSixFrames.toFrames(r, true, false, 6);
260 }else{
261 ArrayList<Orf> list;
262 list=gCaller.callGenes(r);
263 prots=CallGenes.translate(r, list);
264 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
265 for(Orf orf : list){
266 if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){
267 heap.set16S(CallGenes.fetch(orf, r).bases);
268 break;
269 }
270 }
271 }
272 }
273 if(prots!=null){
274 for(Read p : prots){
275 processReadAmino(p);
276 }
277 }
278 }
279
280 void processReadNucleotide(final Read r){
281 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
282 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S
283 if(orf!=null && orf.length()>=min_SSU_len){
284 assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf;
285 assert(orf.is16S());
286 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
287 }
288 //TODO: Add 18S.
289 }
290
291 final byte[] bases=r.bases;
292 final byte[] quals=r.quality;
293 final long[] baseCounts=heap.baseCounts(true);
294 long kmer=0;
295 long rkmer=0;
296 int len=0;
297 assert(!r.aminoacid());
298
299 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
300 final long min=minHashValue;
301 heap.genomeSizeBases+=r.length();
302 heap.genomeSequences++;
303 if(eTracker!=null){eTracker.clear();}
304
305 // assert(false) : minProb+", "+minQual+", "+(quals==null);
306
307 if(quals==null || (minProb<=0 && minQual<2)){
308 // System.err.println("A");
309 for(int i=0; i<bases.length; i++){
310 // System.err.println("B: len="+len);
311 byte b=bases[i];
312 long x=AminoAcid.baseToNumber[b];
313 long x2=AminoAcid.baseToComplementNumber[b];
314
315 kmer=((kmer<<2)|x)&mask;
316 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
317 if(eTracker!=null){eTracker.add(b);}
318 if(x<0){
319 len=0;
320 rkmer=0;
321 }else{
322 len++;
323 baseCounts[(int)x]++;
324 }
325
326 // System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
327 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
328 // +len+", "+(char)b+", "+x+", "+x2+"\n");
329
330 if(len>=k){
331 kmersProcessed++;
332 heap.genomeSizeKmers++;
333 // heap.probSum++; //Note really necessary for fasta data
334 if(eTracker==null || eTracker.passes()){
335 // System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
336
337 // assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) :
338 // "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
339 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
340 // +len+", "+(char)b+", "+x+", "+x2;
341
342 final long hashcode=hash(kmer, rkmer);
343 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
344 if(hashcode>min){
345 if(noBlacklist){
346 heap.add(hashcode);
347 }else{
348 heap.checkAndAdd(hashcode);
349 }
350 }
351 }else{
352 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker);
353 // assert(false);
354 }
355 }
356 }
357 }else{
358 int zeroQualityKmers=0;
359 int positiveQualityKmers=0;
360
361 float prob=1;
362 for(int i=0; i<bases.length; i++){
363 final byte b=bases[i];
364 final long x=AminoAcid.baseToNumber[b];
365 final long x2=AminoAcid.baseToComplementNumber[b];
366
367 kmer=((kmer<<2)|x)&mask;
368 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
369 if(eTracker!=null){eTracker.add(b);}
370
371 final byte q=quals[i];
372 {//Quality-related stuff
373 assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual;
374 // if(x>=0){
375 // if(q>0){
376 // positiveQualityBases++;
377 // }else{
378 // zeroQualityBases++;
379 // }
380 // }
381 prob=prob*align2.QualityTools.PROB_CORRECT[q];
382 if(len>k){
383 byte oldq=quals[i-k];
384 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
385 }
386 if(x<0 || q<minQual){
387 len=0;
388 kmer=rkmer=0;
389 prob=1;
390 }else{
391 len++;
392 baseCounts[(int)x]++;
393 }
394 }
395
396 if(len>=k){
397 kmersProcessed++;
398 if(prob>=minProb){
399 heap.genomeSizeKmers++;
400 heap.probSum+=prob;
401 if(eTracker==null || eTracker.passes()){
402 final long hashcode=hash(kmer, rkmer);
403 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
404 if(hashcode>min){
405 if(noBlacklist){
406 heap.add(hashcode);
407 }else{
408 heap.checkAndAdd(hashcode);
409 }
410 }
411 }else{
412 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
413 }
414 positiveQualityKmers++;
415 }else if(q<=2){
416 zeroQualityKmers++;
417 }
418 }
419
420 //This version is slow but calculates depth better.
421 // if(len>=k){
422 // kmersProcessed++;
423 // heap.genomeSizeKmers++;
424 // final long hash=hash(kmer, rkmer);
425 // // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
426 // if(hash>min){
427 // if(prob>=minProb || (!heap.setMode && heap.contains(hash))){
428 // if(noBlacklist){
429 // heap.add(hash);
430 // }else{
431 // heap.checkAndAdd(hash);
432 // }
433 // }
434 // }
435 // }
436 }
437 if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){
438 if(looksLikePacBio(r)){
439 synchronized(this){
440 minProb=0;
441 }
442 processReadNucleotide(r);
443 }
444 }
445 }
446 // assert(false);
447 }
448
449 boolean looksLikePacBio(Read r){
450 if(r.length()<302 || r.mate!=null){return false;}
451 if(r.quality==null){
452 int x=Parse.parseZmw(r.id);
453 return x>=0;
454 }
455 int positive=0;
456 int zero=0;
457 int ns=0;
458 for(int i=0; i<r.bases.length; i++){
459 byte b=r.bases[i];
460 byte q=r.quality[i];
461 if(b=='N'){ns++;}
462 else if(q==0 || q==2){
463 zero++;
464 }else{
465 positive++;
466 }
467 }
468 return zero>=r.length()/2 && positive==0;
469 }
470
471 void processReadAmino(final Read r){
472 final byte[] bases=r.bases;
473 long kmer=0;
474 int len=0;
475 assert(r.aminoacid());
476
477 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
478 final long min=minHashValue;
479 heap.genomeSizeBases+=r.length();
480 heap.genomeSequences++;
481
482 for(int i=0; i<bases.length; i++){
483 byte b=bases[i];
484 long x=AminoAcid.acidToNumberNoStops[b];
485 kmer=((kmer<<aminoShift)|x)&mask;
486 // if(eTracker!=null){eTracker.add(b);}
487
488 if(x<0){len=0;}else{len++;}
489 if(len>=k){
490 kmersProcessed++;
491 heap.genomeSizeKmers++;
492 // if(eTracker==null || eTracker.passes()){
493 // assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r;
494 long hashcode=hash(kmer, kmer);
495 if(hashcode>min){
496 if(noBlacklist){
497 heap.add(hashcode);
498 }else{
499 heap.checkAndAdd(hashcode);
500 }
501 }
502 // }
503 }
504 }
505 }
506
507 void processReadAmino_old_no_entropy(final Read r){
508 final byte[] bases=r.bases;
509 long kmer=0;
510 int len=0;
511 assert(r.aminoacid());
512
513 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
514 final long min=minHashValue;
515 heap.genomeSizeBases+=r.length();
516 heap.genomeSequences++;
517
518 for(int i=0; i<bases.length; i++){
519 byte b=bases[i];
520 long x=AminoAcid.acidToNumberNoStops[b];
521 kmer=((kmer<<aminoShift)|x)&mask;
522 if(x<0){len=0;}else{len++;}
523 if(len>=k){
524 kmersProcessed++;
525 heap.genomeSizeKmers++;
526 long hashcode=hash(kmer, kmer);
527 if(hashcode>min){
528 if(noBlacklist){
529 heap.add(hashcode);
530 }else{
531 heap.checkAndAdd(hashcode);
532 }
533 }
534 }
535 }
536 }
537
538 public Sketch toSketch(int minCount){
539 Sketch sketch=null;
540 if(heap!=null && heap.size()>0){
541 try {
542 sketch=new Sketch(heap, false, tool.trackCounts, null, minCount);
543 } catch (Throwable e) {
544 // TODO Auto-generated catch block
545 e.printStackTrace();
546 }
547 heap.clear(false);
548 }
549 return sketch;
550 }
551
552 public void add(SketchMakerMini smm){
553 heap.add(smm.heap);
554 readsProcessed+=smm.readsProcessed;
555 basesProcessed+=smm.basesProcessed;
556 kmersProcessed+=smm.kmersProcessed;
557 sketchesMade+=smm.sketchesMade;
558 pacBioDetected|=smm.pacBioDetected;
559 }
560
561 /** True only if this thread has completed successfully */
562 boolean success=false;
563
564 SketchHeap heap;
565
566 final int aminoShift;
567 final int shift;
568 final int shift2;
569 final long mask;
570 final EntropyTracker eTracker;
571 final GeneCaller gCaller;
572
573 public float minEntropy() {
574 // TODO Auto-generated method stub
575 return eTracker==null ? -1 : eTracker.cutoff();
576 }
577
578 /*--------------------------------------------------------------*/
579 /*---------------- Fields ----------------*/
580 /*--------------------------------------------------------------*/
581
582 /** Number of reads processed */
583 protected long readsProcessed=0;
584 /** Number of bases processed */
585 protected long basesProcessed=0;
586 /** Number of bases processed */
587 protected long kmersProcessed=0;
588 /** Number of sketches started */
589 protected long sketchesMade=0;
590
591 float minProb() {return minProb;}
592 byte minQual() {return minQual;}
593 public boolean pacBioDetected=false;
594 private float minProb;
595 private byte minQual;
596
597 /*--------------------------------------------------------------*/
598 /*---------------- Final Fields ----------------*/
599 /*--------------------------------------------------------------*/
600
601 private final SketchTool tool;
602 final int mode;
603
604 /*--------------------------------------------------------------*/
605 /*---------------- Common Fields ----------------*/
606 /*--------------------------------------------------------------*/
607
608 /** Print status messages to this output stream */
609 private PrintStream outstream=System.err;
610 /** Print verbose messages */
611 public static boolean verbose=false;
612 /** True if an error was encountered */
613 public boolean errorState=false;
614
615 }