annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchMakerMini.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package sketch;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4 import java.io.PrintStream;
jpayne@68 5 import java.util.ArrayList;
jpayne@68 6 import java.util.Arrays;
jpayne@68 7
jpayne@68 8 import dna.AminoAcid;
jpayne@68 9 import fileIO.FileFormat;
jpayne@68 10 import fileIO.ReadWrite;
jpayne@68 11 import jgi.BBMerge;
jpayne@68 12 import jgi.TranslateSixFrames;
jpayne@68 13 import prok.CallGenes;
jpayne@68 14 import prok.GeneCaller;
jpayne@68 15 import prok.Orf;
jpayne@68 16 import prok.ProkObject;
jpayne@68 17 import shared.Parse;
jpayne@68 18 import shared.Tools;
jpayne@68 19 import stream.ConcurrentReadInputStream;
jpayne@68 20 import stream.Read;
jpayne@68 21 import structures.EntropyTracker;
jpayne@68 22 import structures.ListNum;
jpayne@68 23 import tax.TaxNode;
jpayne@68 24 import tax.TaxTree;
jpayne@68 25
jpayne@68 26 /**
jpayne@68 27 * Creates MinHashSketches rapidly.
jpayne@68 28 *
jpayne@68 29 * @author Brian Bushnell
jpayne@68 30 * @date July 6, 2016
jpayne@68 31 *
jpayne@68 32 */
jpayne@68 33 public class SketchMakerMini extends SketchObject {
jpayne@68 34
jpayne@68 35 /*--------------------------------------------------------------*/
jpayne@68 36 /*---------------- Initialization ----------------*/
jpayne@68 37 /*--------------------------------------------------------------*/
jpayne@68 38
jpayne@68 39 public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){
jpayne@68 40 this(tool_, mode_, params.minEntropy, params.minProb, params.minQual);
jpayne@68 41 }
jpayne@68 42
jpayne@68 43 /**
jpayne@68 44 * Constructor.
jpayne@68 45 */
jpayne@68 46 public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){
jpayne@68 47
jpayne@68 48 tool=tool_;
jpayne@68 49 mode=mode_;
jpayne@68 50 minProb=minProb_;
jpayne@68 51 minQual=minQual_;
jpayne@68 52
jpayne@68 53 aminoShift=AminoAcid.AMINO_SHIFT;
jpayne@68 54 if(!aminoOrTranslate()){
jpayne@68 55 shift=2*k;
jpayne@68 56 shift2=shift-2;
jpayne@68 57 mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
jpayne@68 58 }else{
jpayne@68 59 shift=aminoShift*k;
jpayne@68 60 shift2=shift-aminoShift;
jpayne@68 61 mask=(shift>63 ? -1L : ~((-1L)<<shift));
jpayne@68 62 }
jpayne@68 63
jpayne@68 64 if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){
jpayne@68 65 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts);
jpayne@68 66 }else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){
jpayne@68 67 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))),
jpayne@68 68 tool.minKeyOccuranceCount, tool.trackCounts);
jpayne@68 69 }else{
jpayne@68 70 heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts);
jpayne@68 71 }
jpayne@68 72
jpayne@68 73 if(minEntropy_>0){
jpayne@68 74 eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true);
jpayne@68 75 }else{
jpayne@68 76 eTracker=null;
jpayne@68 77 }
jpayne@68 78
jpayne@68 79 if(translate || processSSU){
jpayne@68 80 gCaller=CallGenes.makeGeneCaller(pgm);
jpayne@68 81 }else{
jpayne@68 82 gCaller=null;
jpayne@68 83 }
jpayne@68 84 }
jpayne@68 85
jpayne@68 86 /*--------------------------------------------------------------*/
jpayne@68 87 /*---------------- Outer Methods ----------------*/
jpayne@68 88 /*--------------------------------------------------------------*/
jpayne@68 89
jpayne@68 90 /** Create read streams and process all data */
jpayne@68 91 public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){
jpayne@68 92 heap.clear(false); //123
jpayne@68 93 final String simpleName;
jpayne@68 94
jpayne@68 95 final FileFormat ffin1, ffin2;
jpayne@68 96 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
jpayne@68 97 ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true);
jpayne@68 98 ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true);
jpayne@68 99 }else{
jpayne@68 100 ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true);
jpayne@68 101 ffin2=null;
jpayne@68 102 }
jpayne@68 103
jpayne@68 104 //Create a read input stream
jpayne@68 105 final ConcurrentReadInputStream cris;
jpayne@68 106 {
jpayne@68 107 simpleName=ffin1.simpleName();
jpayne@68 108 heap.setFname(simpleName);
jpayne@68 109 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
jpayne@68 110 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
jpayne@68 111 cris.start(); //Start the stream
jpayne@68 112 if(verbose){outstream.println("Started cris");}
jpayne@68 113 }
jpayne@68 114 if(mode==ONE_SKETCH || mode==PER_FILE){
jpayne@68 115 if(heap.name0()==null){heap.setName0(simpleName);}
jpayne@68 116 }
jpayne@68 117 ArrayList<Sketch> sketches=processInner(cris);
jpayne@68 118
jpayne@68 119 errorState|=ReadWrite.closeStream(cris);
jpayne@68 120 sketchesMade++;
jpayne@68 121 return sketches;
jpayne@68 122 }
jpayne@68 123
jpayne@68 124 /*--------------------------------------------------------------*/
jpayne@68 125 /*---------------- Inner Methods ----------------*/
jpayne@68 126 /*--------------------------------------------------------------*/
jpayne@68 127
jpayne@68 128 /** Iterate through the reads */
jpayne@68 129 ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){
jpayne@68 130 assert(heap.size()==0);
jpayne@68 131 ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8);
jpayne@68 132
jpayne@68 133 //Grab the first ListNum of reads
jpayne@68 134 ListNum<Read> ln=cris.nextList();
jpayne@68 135 //Grab the actual read list from the ListNum
jpayne@68 136 ArrayList<Read> reads=(ln!=null ? ln.list : null);
jpayne@68 137
jpayne@68 138 //As long as there is a nonempty read list...
jpayne@68 139 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 140 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
jpayne@68 141
jpayne@68 142 //Loop through each read in the list
jpayne@68 143 for(int idx=0; idx<reads.size(); idx++){
jpayne@68 144 final Read r1=reads.get(idx);
jpayne@68 145 final Read r2=r1.mate;
jpayne@68 146
jpayne@68 147 processReadPair(r1, r2);
jpayne@68 148 if(mode!=ONE_SKETCH && mode!=PER_FILE){
jpayne@68 149 if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){
jpayne@68 150 int size=heap.size();
jpayne@68 151 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
jpayne@68 152 assert(sketch.keys.length>0) : sketch.keys.length+", "+size;
jpayne@68 153 sketch.loadSSU();
jpayne@68 154 sketches.add(sketch);
jpayne@68 155 sketchesMade++;
jpayne@68 156 }
jpayne@68 157 if(heap!=null){heap.clear(false);}
jpayne@68 158 }
jpayne@68 159 }
jpayne@68 160
jpayne@68 161 //Notify the input stream that the list was used
jpayne@68 162 cris.returnList(ln);
jpayne@68 163 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
jpayne@68 164
jpayne@68 165 //Fetch a new list
jpayne@68 166 ln=cris.nextList();
jpayne@68 167 reads=(ln!=null ? ln.list : null);
jpayne@68 168 }
jpayne@68 169
jpayne@68 170 //Notify the input stream that the final list was used
jpayne@68 171 if(ln!=null){
jpayne@68 172 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
jpayne@68 173 }
jpayne@68 174
jpayne@68 175 if(mode==ONE_SKETCH || mode==PER_FILE){
jpayne@68 176 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
jpayne@68 177 sketch.loadSSU();
jpayne@68 178 sketches.add(sketch);
jpayne@68 179 sketchesMade++;
jpayne@68 180 }
jpayne@68 181 heap.clear(true);
jpayne@68 182 return sketches;
jpayne@68 183 }
jpayne@68 184
jpayne@68 185 void processReadPair(Read r1, Read r2){
jpayne@68 186 //Track the initial length for statistics
jpayne@68 187 final int initialLength1=r1.length();
jpayne@68 188 final int initialLength2=r1.mateLength();
jpayne@68 189
jpayne@68 190 //Increment counters
jpayne@68 191 readsProcessed+=r1.pairCount();
jpayne@68 192 basesProcessed+=initialLength1+initialLength2;
jpayne@68 193
jpayne@68 194 if(mode!=ONE_SKETCH && mode!=PER_FILE){
jpayne@68 195 int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize);
jpayne@68 196 if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);}
jpayne@68 197 }
jpayne@68 198
jpayne@68 199 if(tool.mergePairs && r2!=null){
jpayne@68 200 final int insert=BBMerge.findOverlapStrict(r1, r2, false);
jpayne@68 201 if(insert>0){
jpayne@68 202 heap.genomeSequences++;
jpayne@68 203 r2.reverseComplement();
jpayne@68 204 r1=r1.joinRead(insert);
jpayne@68 205 r2=null;
jpayne@68 206 }
jpayne@68 207 }
jpayne@68 208
jpayne@68 209 processRead(r1);
jpayne@68 210 if(r2!=null){processRead(r2);}
jpayne@68 211
jpayne@68 212 if(heap.name0()==null){
jpayne@68 213 heap.setName0(r1.id);
jpayne@68 214 }
jpayne@68 215
jpayne@68 216 TaxNode tn=null;
jpayne@68 217 if(heap.taxID<0 && r1.length()>800){
jpayne@68 218 if(taxtree!=null){
jpayne@68 219 try {
jpayne@68 220 tn=taxtree.parseNodeFromHeader(r1.id, true);
jpayne@68 221 } catch (Throwable e) {}
jpayne@68 222 if(tn!=null){
jpayne@68 223 heap.taxID=tn.id;
jpayne@68 224 if(heap.taxName()==null){
jpayne@68 225 heap.setTaxName(tn.name);
jpayne@68 226 }
jpayne@68 227 }
jpayne@68 228 // System.err.println("A) "+heap.taxID+r1.id);
jpayne@68 229 }else{
jpayne@68 230 heap.taxID=TaxTree.parseHeaderStatic(r1.id);
jpayne@68 231 // System.err.println("B) "+heap.taxID+r1.id);
jpayne@68 232 }
jpayne@68 233 }
jpayne@68 234 assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn;
jpayne@68 235 }
jpayne@68 236
jpayne@68 237 public void processRead(final Read r){
jpayne@68 238 if(amino){
jpayne@68 239 processReadAmino(r);
jpayne@68 240 }else if(translate){
jpayne@68 241 processReadTranslated(r);
jpayne@68 242 }else{
jpayne@68 243 processReadNucleotide(r);
jpayne@68 244 }
jpayne@68 245 }
jpayne@68 246
jpayne@68 247 public void processReadTranslated(final Read r){
jpayne@68 248 assert(!r.aminoacid());
jpayne@68 249 final ArrayList<Read> prots;
jpayne@68 250 if(sixframes){
jpayne@68 251 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
jpayne@68 252 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also
jpayne@68 253 if(orf!=null && orf.length()>=min_SSU_len){
jpayne@68 254 assert(orf.is16S());
jpayne@68 255 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
jpayne@68 256 }
jpayne@68 257 //TODO: Add 18S.
jpayne@68 258 }
jpayne@68 259 prots=TranslateSixFrames.toFrames(r, true, false, 6);
jpayne@68 260 }else{
jpayne@68 261 ArrayList<Orf> list;
jpayne@68 262 list=gCaller.callGenes(r);
jpayne@68 263 prots=CallGenes.translate(r, list);
jpayne@68 264 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
jpayne@68 265 for(Orf orf : list){
jpayne@68 266 if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){
jpayne@68 267 heap.set16S(CallGenes.fetch(orf, r).bases);
jpayne@68 268 break;
jpayne@68 269 }
jpayne@68 270 }
jpayne@68 271 }
jpayne@68 272 }
jpayne@68 273 if(prots!=null){
jpayne@68 274 for(Read p : prots){
jpayne@68 275 processReadAmino(p);
jpayne@68 276 }
jpayne@68 277 }
jpayne@68 278 }
jpayne@68 279
jpayne@68 280 void processReadNucleotide(final Read r){
jpayne@68 281 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
jpayne@68 282 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S
jpayne@68 283 if(orf!=null && orf.length()>=min_SSU_len){
jpayne@68 284 assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf;
jpayne@68 285 assert(orf.is16S());
jpayne@68 286 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
jpayne@68 287 }
jpayne@68 288 //TODO: Add 18S.
jpayne@68 289 }
jpayne@68 290
jpayne@68 291 final byte[] bases=r.bases;
jpayne@68 292 final byte[] quals=r.quality;
jpayne@68 293 final long[] baseCounts=heap.baseCounts(true);
jpayne@68 294 long kmer=0;
jpayne@68 295 long rkmer=0;
jpayne@68 296 int len=0;
jpayne@68 297 assert(!r.aminoacid());
jpayne@68 298
jpayne@68 299 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
jpayne@68 300 final long min=minHashValue;
jpayne@68 301 heap.genomeSizeBases+=r.length();
jpayne@68 302 heap.genomeSequences++;
jpayne@68 303 if(eTracker!=null){eTracker.clear();}
jpayne@68 304
jpayne@68 305 // assert(false) : minProb+", "+minQual+", "+(quals==null);
jpayne@68 306
jpayne@68 307 if(quals==null || (minProb<=0 && minQual<2)){
jpayne@68 308 // System.err.println("A");
jpayne@68 309 for(int i=0; i<bases.length; i++){
jpayne@68 310 // System.err.println("B: len="+len);
jpayne@68 311 byte b=bases[i];
jpayne@68 312 long x=AminoAcid.baseToNumber[b];
jpayne@68 313 long x2=AminoAcid.baseToComplementNumber[b];
jpayne@68 314
jpayne@68 315 kmer=((kmer<<2)|x)&mask;
jpayne@68 316 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
jpayne@68 317 if(eTracker!=null){eTracker.add(b);}
jpayne@68 318 if(x<0){
jpayne@68 319 len=0;
jpayne@68 320 rkmer=0;
jpayne@68 321 }else{
jpayne@68 322 len++;
jpayne@68 323 baseCounts[(int)x]++;
jpayne@68 324 }
jpayne@68 325
jpayne@68 326 // System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
jpayne@68 327 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
jpayne@68 328 // +len+", "+(char)b+", "+x+", "+x2+"\n");
jpayne@68 329
jpayne@68 330 if(len>=k){
jpayne@68 331 kmersProcessed++;
jpayne@68 332 heap.genomeSizeKmers++;
jpayne@68 333 // heap.probSum++; //Note really necessary for fasta data
jpayne@68 334 if(eTracker==null || eTracker.passes()){
jpayne@68 335 // System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
jpayne@68 336
jpayne@68 337 // assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) :
jpayne@68 338 // "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
jpayne@68 339 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
jpayne@68 340 // +len+", "+(char)b+", "+x+", "+x2;
jpayne@68 341
jpayne@68 342 final long hashcode=hash(kmer, rkmer);
jpayne@68 343 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
jpayne@68 344 if(hashcode>min){
jpayne@68 345 if(noBlacklist){
jpayne@68 346 heap.add(hashcode);
jpayne@68 347 }else{
jpayne@68 348 heap.checkAndAdd(hashcode);
jpayne@68 349 }
jpayne@68 350 }
jpayne@68 351 }else{
jpayne@68 352 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker);
jpayne@68 353 // assert(false);
jpayne@68 354 }
jpayne@68 355 }
jpayne@68 356 }
jpayne@68 357 }else{
jpayne@68 358 int zeroQualityKmers=0;
jpayne@68 359 int positiveQualityKmers=0;
jpayne@68 360
jpayne@68 361 float prob=1;
jpayne@68 362 for(int i=0; i<bases.length; i++){
jpayne@68 363 final byte b=bases[i];
jpayne@68 364 final long x=AminoAcid.baseToNumber[b];
jpayne@68 365 final long x2=AminoAcid.baseToComplementNumber[b];
jpayne@68 366
jpayne@68 367 kmer=((kmer<<2)|x)&mask;
jpayne@68 368 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
jpayne@68 369 if(eTracker!=null){eTracker.add(b);}
jpayne@68 370
jpayne@68 371 final byte q=quals[i];
jpayne@68 372 {//Quality-related stuff
jpayne@68 373 assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual;
jpayne@68 374 // if(x>=0){
jpayne@68 375 // if(q>0){
jpayne@68 376 // positiveQualityBases++;
jpayne@68 377 // }else{
jpayne@68 378 // zeroQualityBases++;
jpayne@68 379 // }
jpayne@68 380 // }
jpayne@68 381 prob=prob*align2.QualityTools.PROB_CORRECT[q];
jpayne@68 382 if(len>k){
jpayne@68 383 byte oldq=quals[i-k];
jpayne@68 384 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
jpayne@68 385 }
jpayne@68 386 if(x<0 || q<minQual){
jpayne@68 387 len=0;
jpayne@68 388 kmer=rkmer=0;
jpayne@68 389 prob=1;
jpayne@68 390 }else{
jpayne@68 391 len++;
jpayne@68 392 baseCounts[(int)x]++;
jpayne@68 393 }
jpayne@68 394 }
jpayne@68 395
jpayne@68 396 if(len>=k){
jpayne@68 397 kmersProcessed++;
jpayne@68 398 if(prob>=minProb){
jpayne@68 399 heap.genomeSizeKmers++;
jpayne@68 400 heap.probSum+=prob;
jpayne@68 401 if(eTracker==null || eTracker.passes()){
jpayne@68 402 final long hashcode=hash(kmer, rkmer);
jpayne@68 403 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
jpayne@68 404 if(hashcode>min){
jpayne@68 405 if(noBlacklist){
jpayne@68 406 heap.add(hashcode);
jpayne@68 407 }else{
jpayne@68 408 heap.checkAndAdd(hashcode);
jpayne@68 409 }
jpayne@68 410 }
jpayne@68 411 }else{
jpayne@68 412 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
jpayne@68 413 }
jpayne@68 414 positiveQualityKmers++;
jpayne@68 415 }else if(q<=2){
jpayne@68 416 zeroQualityKmers++;
jpayne@68 417 }
jpayne@68 418 }
jpayne@68 419
jpayne@68 420 //This version is slow but calculates depth better.
jpayne@68 421 // if(len>=k){
jpayne@68 422 // kmersProcessed++;
jpayne@68 423 // heap.genomeSizeKmers++;
jpayne@68 424 // final long hash=hash(kmer, rkmer);
jpayne@68 425 // // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
jpayne@68 426 // if(hash>min){
jpayne@68 427 // if(prob>=minProb || (!heap.setMode && heap.contains(hash))){
jpayne@68 428 // if(noBlacklist){
jpayne@68 429 // heap.add(hash);
jpayne@68 430 // }else{
jpayne@68 431 // heap.checkAndAdd(hash);
jpayne@68 432 // }
jpayne@68 433 // }
jpayne@68 434 // }
jpayne@68 435 // }
jpayne@68 436 }
jpayne@68 437 if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){
jpayne@68 438 if(looksLikePacBio(r)){
jpayne@68 439 synchronized(this){
jpayne@68 440 minProb=0;
jpayne@68 441 }
jpayne@68 442 processReadNucleotide(r);
jpayne@68 443 }
jpayne@68 444 }
jpayne@68 445 }
jpayne@68 446 // assert(false);
jpayne@68 447 }
jpayne@68 448
jpayne@68 449 boolean looksLikePacBio(Read r){
jpayne@68 450 if(r.length()<302 || r.mate!=null){return false;}
jpayne@68 451 if(r.quality==null){
jpayne@68 452 int x=Parse.parseZmw(r.id);
jpayne@68 453 return x>=0;
jpayne@68 454 }
jpayne@68 455 int positive=0;
jpayne@68 456 int zero=0;
jpayne@68 457 int ns=0;
jpayne@68 458 for(int i=0; i<r.bases.length; i++){
jpayne@68 459 byte b=r.bases[i];
jpayne@68 460 byte q=r.quality[i];
jpayne@68 461 if(b=='N'){ns++;}
jpayne@68 462 else if(q==0 || q==2){
jpayne@68 463 zero++;
jpayne@68 464 }else{
jpayne@68 465 positive++;
jpayne@68 466 }
jpayne@68 467 }
jpayne@68 468 return zero>=r.length()/2 && positive==0;
jpayne@68 469 }
jpayne@68 470
jpayne@68 471 void processReadAmino(final Read r){
jpayne@68 472 final byte[] bases=r.bases;
jpayne@68 473 long kmer=0;
jpayne@68 474 int len=0;
jpayne@68 475 assert(r.aminoacid());
jpayne@68 476
jpayne@68 477 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
jpayne@68 478 final long min=minHashValue;
jpayne@68 479 heap.genomeSizeBases+=r.length();
jpayne@68 480 heap.genomeSequences++;
jpayne@68 481
jpayne@68 482 for(int i=0; i<bases.length; i++){
jpayne@68 483 byte b=bases[i];
jpayne@68 484 long x=AminoAcid.acidToNumberNoStops[b];
jpayne@68 485 kmer=((kmer<<aminoShift)|x)&mask;
jpayne@68 486 // if(eTracker!=null){eTracker.add(b);}
jpayne@68 487
jpayne@68 488 if(x<0){len=0;}else{len++;}
jpayne@68 489 if(len>=k){
jpayne@68 490 kmersProcessed++;
jpayne@68 491 heap.genomeSizeKmers++;
jpayne@68 492 // if(eTracker==null || eTracker.passes()){
jpayne@68 493 // assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r;
jpayne@68 494 long hashcode=hash(kmer, kmer);
jpayne@68 495 if(hashcode>min){
jpayne@68 496 if(noBlacklist){
jpayne@68 497 heap.add(hashcode);
jpayne@68 498 }else{
jpayne@68 499 heap.checkAndAdd(hashcode);
jpayne@68 500 }
jpayne@68 501 }
jpayne@68 502 // }
jpayne@68 503 }
jpayne@68 504 }
jpayne@68 505 }
jpayne@68 506
jpayne@68 507 void processReadAmino_old_no_entropy(final Read r){
jpayne@68 508 final byte[] bases=r.bases;
jpayne@68 509 long kmer=0;
jpayne@68 510 int len=0;
jpayne@68 511 assert(r.aminoacid());
jpayne@68 512
jpayne@68 513 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
jpayne@68 514 final long min=minHashValue;
jpayne@68 515 heap.genomeSizeBases+=r.length();
jpayne@68 516 heap.genomeSequences++;
jpayne@68 517
jpayne@68 518 for(int i=0; i<bases.length; i++){
jpayne@68 519 byte b=bases[i];
jpayne@68 520 long x=AminoAcid.acidToNumberNoStops[b];
jpayne@68 521 kmer=((kmer<<aminoShift)|x)&mask;
jpayne@68 522 if(x<0){len=0;}else{len++;}
jpayne@68 523 if(len>=k){
jpayne@68 524 kmersProcessed++;
jpayne@68 525 heap.genomeSizeKmers++;
jpayne@68 526 long hashcode=hash(kmer, kmer);
jpayne@68 527 if(hashcode>min){
jpayne@68 528 if(noBlacklist){
jpayne@68 529 heap.add(hashcode);
jpayne@68 530 }else{
jpayne@68 531 heap.checkAndAdd(hashcode);
jpayne@68 532 }
jpayne@68 533 }
jpayne@68 534 }
jpayne@68 535 }
jpayne@68 536 }
jpayne@68 537
jpayne@68 538 public Sketch toSketch(int minCount){
jpayne@68 539 Sketch sketch=null;
jpayne@68 540 if(heap!=null && heap.size()>0){
jpayne@68 541 try {
jpayne@68 542 sketch=new Sketch(heap, false, tool.trackCounts, null, minCount);
jpayne@68 543 } catch (Throwable e) {
jpayne@68 544 // TODO Auto-generated catch block
jpayne@68 545 e.printStackTrace();
jpayne@68 546 }
jpayne@68 547 heap.clear(false);
jpayne@68 548 }
jpayne@68 549 return sketch;
jpayne@68 550 }
jpayne@68 551
jpayne@68 552 public void add(SketchMakerMini smm){
jpayne@68 553 heap.add(smm.heap);
jpayne@68 554 readsProcessed+=smm.readsProcessed;
jpayne@68 555 basesProcessed+=smm.basesProcessed;
jpayne@68 556 kmersProcessed+=smm.kmersProcessed;
jpayne@68 557 sketchesMade+=smm.sketchesMade;
jpayne@68 558 pacBioDetected|=smm.pacBioDetected;
jpayne@68 559 }
jpayne@68 560
jpayne@68 561 /** True only if this thread has completed successfully */
jpayne@68 562 boolean success=false;
jpayne@68 563
jpayne@68 564 SketchHeap heap;
jpayne@68 565
jpayne@68 566 final int aminoShift;
jpayne@68 567 final int shift;
jpayne@68 568 final int shift2;
jpayne@68 569 final long mask;
jpayne@68 570 final EntropyTracker eTracker;
jpayne@68 571 final GeneCaller gCaller;
jpayne@68 572
jpayne@68 573 public float minEntropy() {
jpayne@68 574 // TODO Auto-generated method stub
jpayne@68 575 return eTracker==null ? -1 : eTracker.cutoff();
jpayne@68 576 }
jpayne@68 577
jpayne@68 578 /*--------------------------------------------------------------*/
jpayne@68 579 /*---------------- Fields ----------------*/
jpayne@68 580 /*--------------------------------------------------------------*/
jpayne@68 581
jpayne@68 582 /** Number of reads processed */
jpayne@68 583 protected long readsProcessed=0;
jpayne@68 584 /** Number of bases processed */
jpayne@68 585 protected long basesProcessed=0;
jpayne@68 586 /** Number of bases processed */
jpayne@68 587 protected long kmersProcessed=0;
jpayne@68 588 /** Number of sketches started */
jpayne@68 589 protected long sketchesMade=0;
jpayne@68 590
jpayne@68 591 float minProb() {return minProb;}
jpayne@68 592 byte minQual() {return minQual;}
jpayne@68 593 public boolean pacBioDetected=false;
jpayne@68 594 private float minProb;
jpayne@68 595 private byte minQual;
jpayne@68 596
jpayne@68 597 /*--------------------------------------------------------------*/
jpayne@68 598 /*---------------- Final Fields ----------------*/
jpayne@68 599 /*--------------------------------------------------------------*/
jpayne@68 600
jpayne@68 601 private final SketchTool tool;
jpayne@68 602 final int mode;
jpayne@68 603
jpayne@68 604 /*--------------------------------------------------------------*/
jpayne@68 605 /*---------------- Common Fields ----------------*/
jpayne@68 606 /*--------------------------------------------------------------*/
jpayne@68 607
jpayne@68 608 /** Print status messages to this output stream */
jpayne@68 609 private PrintStream outstream=System.err;
jpayne@68 610 /** Print verbose messages */
jpayne@68 611 public static boolean verbose=false;
jpayne@68 612 /** True if an error was encountered */
jpayne@68 613 public boolean errorState=false;
jpayne@68 614
jpayne@68 615 }