Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchMakerMini.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchMakerMini.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,615 @@ +package sketch; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import dna.AminoAcid; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import jgi.BBMerge; +import jgi.TranslateSixFrames; +import prok.CallGenes; +import prok.GeneCaller; +import prok.Orf; +import prok.ProkObject; +import shared.Parse; +import shared.Tools; +import stream.ConcurrentReadInputStream; +import stream.Read; +import structures.EntropyTracker; +import structures.ListNum; +import tax.TaxNode; +import tax.TaxTree; + +/** + * Creates MinHashSketches rapidly. + * + * @author Brian Bushnell + * @date July 6, 2016 + * + */ +public class SketchMakerMini extends SketchObject { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){ + this(tool_, mode_, params.minEntropy, params.minProb, params.minQual); + } + + /** + * Constructor. + */ + public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){ + + tool=tool_; + mode=mode_; + minProb=minProb_; + minQual=minQual_; + + aminoShift=AminoAcid.AMINO_SHIFT; + if(!aminoOrTranslate()){ + shift=2*k; + shift2=shift-2; + mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32 + }else{ + shift=aminoShift*k; + shift2=shift-aminoShift; + mask=(shift>63 ? -1L : ~((-1L)<<shift)); + } + + if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){ + heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts); + }else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){ + heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))), + tool.minKeyOccuranceCount, tool.trackCounts); + }else{ + heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts); + } + + if(minEntropy_>0){ + eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true); + }else{ + eTracker=null; + } + + if(translate || processSSU){ + gCaller=CallGenes.makeGeneCaller(pgm); + }else{ + gCaller=null; + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Create read streams and process all data */ + public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){ + heap.clear(false); //123 + final String simpleName; + + final FileFormat ffin1, ffin2; + if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){ + ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true); + ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true); + }else{ + ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true); + ffin2=null; + } + + //Create a read input stream + final ConcurrentReadInputStream cris; + { + simpleName=ffin1.simpleName(); + heap.setFname(simpleName); + cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null); + if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);} + cris.start(); //Start the stream + if(verbose){outstream.println("Started cris");} + } + if(mode==ONE_SKETCH || mode==PER_FILE){ + if(heap.name0()==null){heap.setName0(simpleName);} + } + ArrayList<Sketch> sketches=processInner(cris); + + errorState|=ReadWrite.closeStream(cris); + sketchesMade++; + return sketches; + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Iterate through the reads */ + ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){ + assert(heap.size()==0); + ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8); + + //Grab the first ListNum of reads + ListNum<Read> ln=cris.nextList(); + //Grab the actual read list from the ListNum + ArrayList<Read> reads=(ln!=null ? ln.list : null); + + //As long as there is a nonempty read list... + while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning + // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access + + //Loop through each read in the list + for(int idx=0; idx<reads.size(); idx++){ + final Read r1=reads.get(idx); + final Read r2=r1.mate; + + processReadPair(r1, r2); + if(mode!=ONE_SKETCH && mode!=PER_FILE){ + if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){ + int size=heap.size(); + Sketch sketch=new Sketch(heap, false, tool.trackCounts, null); + assert(sketch.keys.length>0) : sketch.keys.length+", "+size; + sketch.loadSSU(); + sketches.add(sketch); + sketchesMade++; + } + if(heap!=null){heap.clear(false);} + } + } + + //Notify the input stream that the list was used + cris.returnList(ln); + // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access + + //Fetch a new list + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + //Notify the input stream that the final list was used + if(ln!=null){ + cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); + } + + if(mode==ONE_SKETCH || mode==PER_FILE){ + Sketch sketch=new Sketch(heap, false, tool.trackCounts, null); + sketch.loadSSU(); + sketches.add(sketch); + sketchesMade++; + } + heap.clear(true); + return sketches; + } + + void processReadPair(Read r1, Read r2){ + //Track the initial length for statistics + final int initialLength1=r1.length(); + final int initialLength2=r1.mateLength(); + + //Increment counters + readsProcessed+=r1.pairCount(); + basesProcessed+=initialLength1+initialLength2; + + if(mode!=ONE_SKETCH && mode!=PER_FILE){ + int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize); + if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);} + } + + if(tool.mergePairs && r2!=null){ + final int insert=BBMerge.findOverlapStrict(r1, r2, false); + if(insert>0){ + heap.genomeSequences++; + r2.reverseComplement(); + r1=r1.joinRead(insert); + r2=null; + } + } + + processRead(r1); + if(r2!=null){processRead(r2);} + + if(heap.name0()==null){ + heap.setName0(r1.id); + } + + TaxNode tn=null; + if(heap.taxID<0 && r1.length()>800){ + if(taxtree!=null){ + try { + tn=taxtree.parseNodeFromHeader(r1.id, true); + } catch (Throwable e) {} + if(tn!=null){ + heap.taxID=tn.id; + if(heap.taxName()==null){ + heap.setTaxName(tn.name); + } + } +// System.err.println("A) "+heap.taxID+r1.id); + }else{ + heap.taxID=TaxTree.parseHeaderStatic(r1.id); +// System.err.println("B) "+heap.taxID+r1.id); + } + } + assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn; + } + + public void processRead(final Read r){ + if(amino){ + processReadAmino(r); + }else if(translate){ + processReadTranslated(r); + }else{ + processReadNucleotide(r); + } + } + + public void processReadTranslated(final Read r){ + assert(!r.aminoacid()); + final ArrayList<Read> prots; + if(sixframes){ + if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ + Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also + if(orf!=null && orf.length()>=min_SSU_len){ + assert(orf.is16S()); + if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);} + } + //TODO: Add 18S. + } + prots=TranslateSixFrames.toFrames(r, true, false, 6); + }else{ + ArrayList<Orf> list; + list=gCaller.callGenes(r); + prots=CallGenes.translate(r, list); + if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ + for(Orf orf : list){ + if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){ + heap.set16S(CallGenes.fetch(orf, r).bases); + break; + } + } + } + } + if(prots!=null){ + for(Read p : prots){ + processReadAmino(p); + } + } + } + + void processReadNucleotide(final Read r){ + if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ + Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S + if(orf!=null && orf.length()>=min_SSU_len){ + assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf; + assert(orf.is16S()); + if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);} + } + //TODO: Add 18S. + } + + final byte[] bases=r.bases; + final byte[] quals=r.quality; + final long[] baseCounts=heap.baseCounts(true); + long kmer=0; + long rkmer=0; + int len=0; + assert(!r.aminoacid()); + + final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); + final long min=minHashValue; + heap.genomeSizeBases+=r.length(); + heap.genomeSequences++; + if(eTracker!=null){eTracker.clear();} + +// assert(false) : minProb+", "+minQual+", "+(quals==null); + + if(quals==null || (minProb<=0 && minQual<2)){ +// System.err.println("A"); + for(int i=0; i<bases.length; i++){ +// System.err.println("B: len="+len); + byte b=bases[i]; + long x=AminoAcid.baseToNumber[b]; + long x2=AminoAcid.baseToComplementNumber[b]; + + kmer=((kmer<<2)|x)&mask; + rkmer=((rkmer>>>2)|(x2<<shift2))&mask; + if(eTracker!=null){eTracker.add(b);} + if(x<0){ + len=0; + rkmer=0; + }else{ + len++; + baseCounts[(int)x]++; + } + +// System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n" +// +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n" +// +len+", "+(char)b+", "+x+", "+x2+"\n"); + + if(len>=k){ + kmersProcessed++; + heap.genomeSizeKmers++; +// heap.probSum++; //Note really necessary for fasta data + if(eTracker==null || eTracker.passes()){ +// System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()); + +// assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) : +// "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n" +// +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n" +// +len+", "+(char)b+", "+x+", "+x2; + + final long hashcode=hash(kmer, rkmer); + // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); + if(hashcode>min){ + if(noBlacklist){ + heap.add(hashcode); + }else{ + heap.checkAndAdd(hashcode); + } + } + }else{ +// System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker); +// assert(false); + } + } + } + }else{ + int zeroQualityKmers=0; + int positiveQualityKmers=0; + + float prob=1; + for(int i=0; i<bases.length; i++){ + final byte b=bases[i]; + final long x=AminoAcid.baseToNumber[b]; + final long x2=AminoAcid.baseToComplementNumber[b]; + + kmer=((kmer<<2)|x)&mask; + rkmer=((rkmer>>>2)|(x2<<shift2))&mask; + if(eTracker!=null){eTracker.add(b);} + + final byte q=quals[i]; + {//Quality-related stuff + assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual; +// if(x>=0){ +// if(q>0){ +// positiveQualityBases++; +// }else{ +// zeroQualityBases++; +// } +// } + prob=prob*align2.QualityTools.PROB_CORRECT[q]; + if(len>k){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + if(x<0 || q<minQual){ + len=0; + kmer=rkmer=0; + prob=1; + }else{ + len++; + baseCounts[(int)x]++; + } + } + + if(len>=k){ + kmersProcessed++; + if(prob>=minProb){ + heap.genomeSizeKmers++; + heap.probSum+=prob; + if(eTracker==null || eTracker.passes()){ + final long hashcode=hash(kmer, rkmer); +// System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); + if(hashcode>min){ + if(noBlacklist){ + heap.add(hashcode); + }else{ + heap.checkAndAdd(hashcode); + } + } + }else{ +// System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()); + } + positiveQualityKmers++; + }else if(q<=2){ + zeroQualityKmers++; + } + } + + //This version is slow but calculates depth better. +// if(len>=k){ +// kmersProcessed++; +// heap.genomeSizeKmers++; +// final long hash=hash(kmer, rkmer); +// // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); +// if(hash>min){ +// if(prob>=minProb || (!heap.setMode && heap.contains(hash))){ +// if(noBlacklist){ +// heap.add(hash); +// }else{ +// heap.checkAndAdd(hash); +// } +// } +// } +// } + } + if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){ + if(looksLikePacBio(r)){ + synchronized(this){ + minProb=0; + } + processReadNucleotide(r); + } + } + } +// assert(false); + } + + boolean looksLikePacBio(Read r){ + if(r.length()<302 || r.mate!=null){return false;} + if(r.quality==null){ + int x=Parse.parseZmw(r.id); + return x>=0; + } + int positive=0; + int zero=0; + int ns=0; + for(int i=0; i<r.bases.length; i++){ + byte b=r.bases[i]; + byte q=r.quality[i]; + if(b=='N'){ns++;} + else if(q==0 || q==2){ + zero++; + }else{ + positive++; + } + } + return zero>=r.length()/2 && positive==0; + } + + void processReadAmino(final Read r){ + final byte[] bases=r.bases; + long kmer=0; + int len=0; + assert(r.aminoacid()); + + final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); + final long min=minHashValue; + heap.genomeSizeBases+=r.length(); + heap.genomeSequences++; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + long x=AminoAcid.acidToNumberNoStops[b]; + kmer=((kmer<<aminoShift)|x)&mask; +// if(eTracker!=null){eTracker.add(b);} + + if(x<0){len=0;}else{len++;} + if(len>=k){ + kmersProcessed++; + heap.genomeSizeKmers++; +// if(eTracker==null || eTracker.passes()){ +// assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r; + long hashcode=hash(kmer, kmer); + if(hashcode>min){ + if(noBlacklist){ + heap.add(hashcode); + }else{ + heap.checkAndAdd(hashcode); + } + } +// } + } + } + } + + void processReadAmino_old_no_entropy(final Read r){ + final byte[] bases=r.bases; + long kmer=0; + int len=0; + assert(r.aminoacid()); + + final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); + final long min=minHashValue; + heap.genomeSizeBases+=r.length(); + heap.genomeSequences++; + + for(int i=0; i<bases.length; i++){ + byte b=bases[i]; + long x=AminoAcid.acidToNumberNoStops[b]; + kmer=((kmer<<aminoShift)|x)&mask; + if(x<0){len=0;}else{len++;} + if(len>=k){ + kmersProcessed++; + heap.genomeSizeKmers++; + long hashcode=hash(kmer, kmer); + if(hashcode>min){ + if(noBlacklist){ + heap.add(hashcode); + }else{ + heap.checkAndAdd(hashcode); + } + } + } + } + } + + public Sketch toSketch(int minCount){ + Sketch sketch=null; + if(heap!=null && heap.size()>0){ + try { + sketch=new Sketch(heap, false, tool.trackCounts, null, minCount); + } catch (Throwable e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + heap.clear(false); + } + return sketch; + } + + public void add(SketchMakerMini smm){ + heap.add(smm.heap); + readsProcessed+=smm.readsProcessed; + basesProcessed+=smm.basesProcessed; + kmersProcessed+=smm.kmersProcessed; + sketchesMade+=smm.sketchesMade; + pacBioDetected|=smm.pacBioDetected; + } + + /** True only if this thread has completed successfully */ + boolean success=false; + + SketchHeap heap; + + final int aminoShift; + final int shift; + final int shift2; + final long mask; + final EntropyTracker eTracker; + final GeneCaller gCaller; + + public float minEntropy() { + // TODO Auto-generated method stub + return eTracker==null ? -1 : eTracker.cutoff(); + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Number of reads processed */ + protected long readsProcessed=0; + /** Number of bases processed */ + protected long basesProcessed=0; + /** Number of bases processed */ + protected long kmersProcessed=0; + /** Number of sketches started */ + protected long sketchesMade=0; + + float minProb() {return minProb;} + byte minQual() {return minQual;} + public boolean pacBioDetected=false; + private float minProb; + private byte minQual; + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private final SketchTool tool; + final int mode; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Print status messages to this output stream */ + private PrintStream outstream=System.err; + /** Print verbose messages */ + public static boolean verbose=false; + /** True if an error was encountered */ + public boolean errorState=false; + +}