jpayne@68: package sketch; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.io.PrintStream; jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.Arrays; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import fileIO.FileFormat; jpayne@68: import fileIO.ReadWrite; jpayne@68: import jgi.BBMerge; jpayne@68: import jgi.TranslateSixFrames; jpayne@68: import prok.CallGenes; jpayne@68: import prok.GeneCaller; jpayne@68: import prok.Orf; jpayne@68: import prok.ProkObject; jpayne@68: import shared.Parse; jpayne@68: import shared.Tools; jpayne@68: import stream.ConcurrentReadInputStream; jpayne@68: import stream.Read; jpayne@68: import structures.EntropyTracker; jpayne@68: import structures.ListNum; jpayne@68: import tax.TaxNode; jpayne@68: import tax.TaxTree; jpayne@68: jpayne@68: /** jpayne@68: * Creates MinHashSketches rapidly. jpayne@68: * jpayne@68: * @author Brian Bushnell jpayne@68: * @date July 6, 2016 jpayne@68: * jpayne@68: */ jpayne@68: public class SketchMakerMini extends SketchObject { jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Initialization ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){ jpayne@68: this(tool_, mode_, params.minEntropy, params.minProb, params.minQual); jpayne@68: } jpayne@68: jpayne@68: /** jpayne@68: * Constructor. jpayne@68: */ jpayne@68: public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){ jpayne@68: jpayne@68: tool=tool_; jpayne@68: mode=mode_; jpayne@68: minProb=minProb_; jpayne@68: minQual=minQual_; jpayne@68: jpayne@68: aminoShift=AminoAcid.AMINO_SHIFT; jpayne@68: if(!aminoOrTranslate()){ jpayne@68: shift=2*k; jpayne@68: shift2=shift-2; jpayne@68: mask=(shift>63 ? -1L : ~((-1L)<63 ? -1L : ~((-1L)<0){ jpayne@68: eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true); jpayne@68: }else{ jpayne@68: eTracker=null; jpayne@68: } jpayne@68: jpayne@68: if(translate || processSSU){ jpayne@68: gCaller=CallGenes.makeGeneCaller(pgm); jpayne@68: }else{ jpayne@68: gCaller=null; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Outer Methods ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Create read streams and process all data */ jpayne@68: public ArrayList toSketches(final String fname, float samplerate, long reads){ jpayne@68: heap.clear(false); //123 jpayne@68: final String simpleName; jpayne@68: jpayne@68: final FileFormat ffin1, ffin2; jpayne@68: if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){ jpayne@68: ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true); jpayne@68: ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true); jpayne@68: }else{ jpayne@68: ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true); jpayne@68: ffin2=null; jpayne@68: } jpayne@68: jpayne@68: //Create a read input stream jpayne@68: final ConcurrentReadInputStream cris; jpayne@68: { jpayne@68: simpleName=ffin1.simpleName(); jpayne@68: heap.setFname(simpleName); jpayne@68: cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null); jpayne@68: if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);} jpayne@68: cris.start(); //Start the stream jpayne@68: if(verbose){outstream.println("Started cris");} jpayne@68: } jpayne@68: if(mode==ONE_SKETCH || mode==PER_FILE){ jpayne@68: if(heap.name0()==null){heap.setName0(simpleName);} jpayne@68: } jpayne@68: ArrayList sketches=processInner(cris); jpayne@68: jpayne@68: errorState|=ReadWrite.closeStream(cris); jpayne@68: sketchesMade++; jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Inner Methods ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Iterate through the reads */ jpayne@68: ArrayList processInner(ConcurrentReadInputStream cris){ jpayne@68: assert(heap.size()==0); jpayne@68: ArrayList sketches=new ArrayList(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8); jpayne@68: jpayne@68: //Grab the first ListNum of reads jpayne@68: ListNum ln=cris.nextList(); jpayne@68: //Grab the actual read list from the ListNum jpayne@68: ArrayList reads=(ln!=null ? ln.list : null); jpayne@68: jpayne@68: //As long as there is a nonempty read list... jpayne@68: while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning jpayne@68: // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access jpayne@68: jpayne@68: //Loop through each read in the list jpayne@68: for(int idx=0; idx0 && heap.maxLen()>=Tools.max(1, minSketchSize)){ jpayne@68: int size=heap.size(); jpayne@68: Sketch sketch=new Sketch(heap, false, tool.trackCounts, null); jpayne@68: assert(sketch.keys.length>0) : sketch.keys.length+", "+size; jpayne@68: sketch.loadSSU(); jpayne@68: sketches.add(sketch); jpayne@68: sketchesMade++; jpayne@68: } jpayne@68: if(heap!=null){heap.clear(false);} jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: //Notify the input stream that the list was used jpayne@68: cris.returnList(ln); jpayne@68: // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access jpayne@68: jpayne@68: //Fetch a new list jpayne@68: ln=cris.nextList(); jpayne@68: reads=(ln!=null ? ln.list : null); jpayne@68: } jpayne@68: jpayne@68: //Notify the input stream that the final list was used jpayne@68: if(ln!=null){ jpayne@68: cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); jpayne@68: } jpayne@68: jpayne@68: if(mode==ONE_SKETCH || mode==PER_FILE){ jpayne@68: Sketch sketch=new Sketch(heap, false, tool.trackCounts, null); jpayne@68: sketch.loadSSU(); jpayne@68: sketches.add(sketch); jpayne@68: sketchesMade++; jpayne@68: } jpayne@68: heap.clear(true); jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: void processReadPair(Read r1, Read r2){ jpayne@68: //Track the initial length for statistics jpayne@68: final int initialLength1=r1.length(); jpayne@68: final int initialLength2=r1.mateLength(); jpayne@68: jpayne@68: //Increment counters jpayne@68: readsProcessed+=r1.pairCount(); jpayne@68: basesProcessed+=initialLength1+initialLength2; jpayne@68: jpayne@68: if(mode!=ONE_SKETCH && mode!=PER_FILE){ jpayne@68: int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize); jpayne@68: if(heap==null || heap.capacity()0){ jpayne@68: heap.genomeSequences++; jpayne@68: r2.reverseComplement(); jpayne@68: r1=r1.joinRead(insert); jpayne@68: r2=null; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: processRead(r1); jpayne@68: if(r2!=null){processRead(r2);} jpayne@68: jpayne@68: if(heap.name0()==null){ jpayne@68: heap.setName0(r1.id); jpayne@68: } jpayne@68: jpayne@68: TaxNode tn=null; jpayne@68: if(heap.taxID<0 && r1.length()>800){ jpayne@68: if(taxtree!=null){ jpayne@68: try { jpayne@68: tn=taxtree.parseNodeFromHeader(r1.id, true); jpayne@68: } catch (Throwable e) {} jpayne@68: if(tn!=null){ jpayne@68: heap.taxID=tn.id; jpayne@68: if(heap.taxName()==null){ jpayne@68: heap.setTaxName(tn.name); jpayne@68: } jpayne@68: } jpayne@68: // System.err.println("A) "+heap.taxID+r1.id); jpayne@68: }else{ jpayne@68: heap.taxID=TaxTree.parseHeaderStatic(r1.id); jpayne@68: // System.err.println("B) "+heap.taxID+r1.id); jpayne@68: } jpayne@68: } jpayne@68: assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn; jpayne@68: } jpayne@68: jpayne@68: public void processRead(final Read r){ jpayne@68: if(amino){ jpayne@68: processReadAmino(r); jpayne@68: }else if(translate){ jpayne@68: processReadTranslated(r); jpayne@68: }else{ jpayne@68: processReadNucleotide(r); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public void processReadTranslated(final Read r){ jpayne@68: assert(!r.aminoacid()); jpayne@68: final ArrayList prots; jpayne@68: if(sixframes){ jpayne@68: if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ jpayne@68: Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also jpayne@68: if(orf!=null && orf.length()>=min_SSU_len){ jpayne@68: assert(orf.is16S()); jpayne@68: if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);} jpayne@68: } jpayne@68: //TODO: Add 18S. jpayne@68: } jpayne@68: prots=TranslateSixFrames.toFrames(r, true, false, 6); jpayne@68: }else{ jpayne@68: ArrayList list; jpayne@68: list=gCaller.callGenes(r); jpayne@68: prots=CallGenes.translate(r, list); jpayne@68: if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ jpayne@68: for(Orf orf : list){ jpayne@68: if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){ jpayne@68: heap.set16S(CallGenes.fetch(orf, r).bases); jpayne@68: break; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(prots!=null){ jpayne@68: for(Read p : prots){ jpayne@68: processReadAmino(p); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: void processReadNucleotide(final Read r){ jpayne@68: if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){ jpayne@68: Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S jpayne@68: if(orf!=null && orf.length()>=min_SSU_len){ jpayne@68: assert(orf.start>=0 && orf.stop=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);} jpayne@68: } jpayne@68: //TODO: Add 18S. jpayne@68: } jpayne@68: jpayne@68: final byte[] bases=r.bases; jpayne@68: final byte[] quals=r.quality; jpayne@68: final long[] baseCounts=heap.baseCounts(true); jpayne@68: long kmer=0; jpayne@68: long rkmer=0; jpayne@68: int len=0; jpayne@68: assert(!r.aminoacid()); jpayne@68: jpayne@68: final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); jpayne@68: final long min=minHashValue; jpayne@68: heap.genomeSizeBases+=r.length(); jpayne@68: heap.genomeSequences++; jpayne@68: if(eTracker!=null){eTracker.clear();} jpayne@68: jpayne@68: // assert(false) : minProb+", "+minQual+", "+(quals==null); jpayne@68: jpayne@68: if(quals==null || (minProb<=0 && minQual<2)){ jpayne@68: // System.err.println("A"); jpayne@68: for(int i=0; i=k){ jpayne@68: kmersProcessed++; jpayne@68: heap.genomeSizeKmers++; jpayne@68: // heap.probSum++; //Note really necessary for fasta data jpayne@68: if(eTracker==null || eTracker.passes()){ jpayne@68: // System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()); jpayne@68: jpayne@68: // assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) : jpayne@68: // "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n" jpayne@68: // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n" jpayne@68: // +len+", "+(char)b+", "+x+", "+x2; jpayne@68: jpayne@68: final long hashcode=hash(kmer, rkmer); jpayne@68: // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); jpayne@68: if(hashcode>min){ jpayne@68: if(noBlacklist){ jpayne@68: heap.add(hashcode); jpayne@68: }else{ jpayne@68: heap.checkAndAdd(hashcode); jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker); jpayne@68: // assert(false); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: int zeroQualityKmers=0; jpayne@68: int positiveQualityKmers=0; jpayne@68: jpayne@68: float prob=1; jpayne@68: for(int i=0; i>>2)|(x2<=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual; jpayne@68: // if(x>=0){ jpayne@68: // if(q>0){ jpayne@68: // positiveQualityBases++; jpayne@68: // }else{ jpayne@68: // zeroQualityBases++; jpayne@68: // } jpayne@68: // } jpayne@68: prob=prob*align2.QualityTools.PROB_CORRECT[q]; jpayne@68: if(len>k){ jpayne@68: byte oldq=quals[i-k]; jpayne@68: prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; jpayne@68: } jpayne@68: if(x<0 || q=k){ jpayne@68: kmersProcessed++; jpayne@68: if(prob>=minProb){ jpayne@68: heap.genomeSizeKmers++; jpayne@68: heap.probSum+=prob; jpayne@68: if(eTracker==null || eTracker.passes()){ jpayne@68: final long hashcode=hash(kmer, rkmer); jpayne@68: // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); jpayne@68: if(hashcode>min){ jpayne@68: if(noBlacklist){ jpayne@68: heap.add(hashcode); jpayne@68: }else{ jpayne@68: heap.checkAndAdd(hashcode); jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()); jpayne@68: } jpayne@68: positiveQualityKmers++; jpayne@68: }else if(q<=2){ jpayne@68: zeroQualityKmers++; jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: //This version is slow but calculates depth better. jpayne@68: // if(len>=k){ jpayne@68: // kmersProcessed++; jpayne@68: // heap.genomeSizeKmers++; jpayne@68: // final long hash=hash(kmer, rkmer); jpayne@68: // // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash); jpayne@68: // if(hash>min){ jpayne@68: // if(prob>=minProb || (!heap.setMode && heap.contains(hash))){ jpayne@68: // if(noBlacklist){ jpayne@68: // heap.add(hash); jpayne@68: // }else{ jpayne@68: // heap.checkAndAdd(hash); jpayne@68: // } jpayne@68: // } jpayne@68: // } jpayne@68: // } jpayne@68: } jpayne@68: if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){ jpayne@68: if(looksLikePacBio(r)){ jpayne@68: synchronized(this){ jpayne@68: minProb=0; jpayne@68: } jpayne@68: processReadNucleotide(r); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: // assert(false); jpayne@68: } jpayne@68: jpayne@68: boolean looksLikePacBio(Read r){ jpayne@68: if(r.length()<302 || r.mate!=null){return false;} jpayne@68: if(r.quality==null){ jpayne@68: int x=Parse.parseZmw(r.id); jpayne@68: return x>=0; jpayne@68: } jpayne@68: int positive=0; jpayne@68: int zero=0; jpayne@68: int ns=0; jpayne@68: for(int i=0; i=r.length()/2 && positive==0; jpayne@68: } jpayne@68: jpayne@68: void processReadAmino(final Read r){ jpayne@68: final byte[] bases=r.bases; jpayne@68: long kmer=0; jpayne@68: int len=0; jpayne@68: assert(r.aminoacid()); jpayne@68: jpayne@68: final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); jpayne@68: final long min=minHashValue; jpayne@68: heap.genomeSizeBases+=r.length(); jpayne@68: heap.genomeSequences++; jpayne@68: jpayne@68: for(int i=0; i=k){ jpayne@68: kmersProcessed++; jpayne@68: heap.genomeSizeKmers++; jpayne@68: // if(eTracker==null || eTracker.passes()){ jpayne@68: // assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r; jpayne@68: long hashcode=hash(kmer, kmer); jpayne@68: if(hashcode>min){ jpayne@68: if(noBlacklist){ jpayne@68: heap.add(hashcode); jpayne@68: }else{ jpayne@68: heap.checkAndAdd(hashcode); jpayne@68: } jpayne@68: } jpayne@68: // } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: void processReadAmino_old_no_entropy(final Read r){ jpayne@68: final byte[] bases=r.bases; jpayne@68: long kmer=0; jpayne@68: int len=0; jpayne@68: assert(r.aminoacid()); jpayne@68: jpayne@68: final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists()); jpayne@68: final long min=minHashValue; jpayne@68: heap.genomeSizeBases+=r.length(); jpayne@68: heap.genomeSequences++; jpayne@68: jpayne@68: for(int i=0; i=k){ jpayne@68: kmersProcessed++; jpayne@68: heap.genomeSizeKmers++; jpayne@68: long hashcode=hash(kmer, kmer); jpayne@68: if(hashcode>min){ jpayne@68: if(noBlacklist){ jpayne@68: heap.add(hashcode); jpayne@68: }else{ jpayne@68: heap.checkAndAdd(hashcode); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public Sketch toSketch(int minCount){ jpayne@68: Sketch sketch=null; jpayne@68: if(heap!=null && heap.size()>0){ jpayne@68: try { jpayne@68: sketch=new Sketch(heap, false, tool.trackCounts, null, minCount); jpayne@68: } catch (Throwable e) { jpayne@68: // TODO Auto-generated catch block jpayne@68: e.printStackTrace(); jpayne@68: } jpayne@68: heap.clear(false); jpayne@68: } jpayne@68: return sketch; jpayne@68: } jpayne@68: jpayne@68: public void add(SketchMakerMini smm){ jpayne@68: heap.add(smm.heap); jpayne@68: readsProcessed+=smm.readsProcessed; jpayne@68: basesProcessed+=smm.basesProcessed; jpayne@68: kmersProcessed+=smm.kmersProcessed; jpayne@68: sketchesMade+=smm.sketchesMade; jpayne@68: pacBioDetected|=smm.pacBioDetected; jpayne@68: } jpayne@68: jpayne@68: /** True only if this thread has completed successfully */ jpayne@68: boolean success=false; jpayne@68: jpayne@68: SketchHeap heap; jpayne@68: jpayne@68: final int aminoShift; jpayne@68: final int shift; jpayne@68: final int shift2; jpayne@68: final long mask; jpayne@68: final EntropyTracker eTracker; jpayne@68: final GeneCaller gCaller; jpayne@68: jpayne@68: public float minEntropy() { jpayne@68: // TODO Auto-generated method stub jpayne@68: return eTracker==null ? -1 : eTracker.cutoff(); jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Number of reads processed */ jpayne@68: protected long readsProcessed=0; jpayne@68: /** Number of bases processed */ jpayne@68: protected long basesProcessed=0; jpayne@68: /** Number of bases processed */ jpayne@68: protected long kmersProcessed=0; jpayne@68: /** Number of sketches started */ jpayne@68: protected long sketchesMade=0; jpayne@68: jpayne@68: float minProb() {return minProb;} jpayne@68: byte minQual() {return minQual;} jpayne@68: public boolean pacBioDetected=false; jpayne@68: private float minProb; jpayne@68: private byte minQual; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Final Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: private final SketchTool tool; jpayne@68: final int mode; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Common Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: /** Print status messages to this output stream */ jpayne@68: private PrintStream outstream=System.err; jpayne@68: /** Print verbose messages */ jpayne@68: public static boolean verbose=false; jpayne@68: /** True if an error was encountered */ jpayne@68: public boolean errorState=false; jpayne@68: jpayne@68: }