view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchMakerMini.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line source
package sketch;

import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;

import dna.AminoAcid;
import fileIO.FileFormat;
import fileIO.ReadWrite;
import jgi.BBMerge;
import jgi.TranslateSixFrames;
import prok.CallGenes;
import prok.GeneCaller;
import prok.Orf;
import prok.ProkObject;
import shared.Parse;
import shared.Tools;
import stream.ConcurrentReadInputStream;
import stream.Read;
import structures.EntropyTracker;
import structures.ListNum;
import tax.TaxNode;
import tax.TaxTree;

/**
 * Creates MinHashSketches rapidly.
 * 
 * @author Brian Bushnell
 * @date July 6, 2016
 *
 */
public class SketchMakerMini extends SketchObject {
	
	/*--------------------------------------------------------------*/
	/*----------------        Initialization        ----------------*/
	/*--------------------------------------------------------------*/
	
	public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){
		this(tool_, mode_, params.minEntropy, params.minProb, params.minQual);
	}
	
	/**
	 * Constructor.
	 */
	public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){
		
		tool=tool_;
		mode=mode_;
		minProb=minProb_;
		minQual=minQual_;
		
		aminoShift=AminoAcid.AMINO_SHIFT;
		if(!aminoOrTranslate()){
			shift=2*k;
			shift2=shift-2;
			mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
		}else{
			shift=aminoShift*k;
			shift2=shift-aminoShift;
			mask=(shift>63 ? -1L : ~((-1L)<<shift));
		}
		
		if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){
			heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts);
		}else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){
			heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))), 
					tool.minKeyOccuranceCount, tool.trackCounts);
		}else{
			heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts);
		}
		
		if(minEntropy_>0){
			eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true);
		}else{
			eTracker=null;
		}
		
		if(translate || processSSU){
			gCaller=CallGenes.makeGeneCaller(pgm);
		}else{
			gCaller=null;
		}
	}
	
	/*--------------------------------------------------------------*/
	/*----------------         Outer Methods        ----------------*/
	/*--------------------------------------------------------------*/

	/** Create read streams and process all data */
	public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){
		heap.clear(false); //123
		final String simpleName;
		
		final FileFormat ffin1, ffin2;
		if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
			ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true);
			ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true);
		}else{
			ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true);
			ffin2=null;
		}
		
		//Create a read input stream
		final ConcurrentReadInputStream cris;
		{
			simpleName=ffin1.simpleName();
			heap.setFname(simpleName);
			cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
			if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
			cris.start(); //Start the stream
			if(verbose){outstream.println("Started cris");}
		}
		if(mode==ONE_SKETCH || mode==PER_FILE){
			 if(heap.name0()==null){heap.setName0(simpleName);}
		}
		ArrayList<Sketch> sketches=processInner(cris);
		
		errorState|=ReadWrite.closeStream(cris);
		sketchesMade++;
		return sketches;
	}
	
	/*--------------------------------------------------------------*/
	/*----------------         Inner Methods        ----------------*/
	/*--------------------------------------------------------------*/
	
	/** Iterate through the reads */
	ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){
		assert(heap.size()==0);
		ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8);
		
		//Grab the first ListNum of reads
		ListNum<Read> ln=cris.nextList();
		//Grab the actual read list from the ListNum
		ArrayList<Read> reads=(ln!=null ? ln.list : null);

		//As long as there is a nonempty read list...
		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
			//				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
			
			//Loop through each read in the list
			for(int idx=0; idx<reads.size(); idx++){
				final Read r1=reads.get(idx);
				final Read r2=r1.mate;
				
				processReadPair(r1, r2);
				if(mode!=ONE_SKETCH && mode!=PER_FILE){
					if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){
						int size=heap.size();
						Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
						assert(sketch.keys.length>0) : sketch.keys.length+", "+size;
						sketch.loadSSU();
						sketches.add(sketch);
						sketchesMade++;
					}
					if(heap!=null){heap.clear(false);}
				}
			}

			//Notify the input stream that the list was used
			cris.returnList(ln);
			//				if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access

			//Fetch a new list
			ln=cris.nextList();
			reads=(ln!=null ? ln.list : null);
		}

		//Notify the input stream that the final list was used
		if(ln!=null){
			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
		}
		
		if(mode==ONE_SKETCH || mode==PER_FILE){
			Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
			sketch.loadSSU();
			sketches.add(sketch);
			sketchesMade++;
		}
		heap.clear(true);
		return sketches;
	}

	void processReadPair(Read r1, Read r2){
		//Track the initial length for statistics
		final int initialLength1=r1.length();
		final int initialLength2=r1.mateLength();

		//Increment counters
		readsProcessed+=r1.pairCount();
		basesProcessed+=initialLength1+initialLength2;
		
		if(mode!=ONE_SKETCH && mode!=PER_FILE){
			int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize);
			if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);}
		}
		
		if(tool.mergePairs && r2!=null){
			final int insert=BBMerge.findOverlapStrict(r1, r2, false);
			if(insert>0){
				heap.genomeSequences++;
				r2.reverseComplement();
				r1=r1.joinRead(insert);
				r2=null;
			}
		}
		
		processRead(r1);
		if(r2!=null){processRead(r2);}

		if(heap.name0()==null){
			heap.setName0(r1.id);
		}
		
		TaxNode tn=null;
		if(heap.taxID<0 && r1.length()>800){
			if(taxtree!=null){ 
				try {
					tn=taxtree.parseNodeFromHeader(r1.id, true);
				} catch (Throwable e) {}
				if(tn!=null){
					heap.taxID=tn.id;
					if(heap.taxName()==null){
						heap.setTaxName(tn.name);
					}
				}
//				System.err.println("A) "+heap.taxID+r1.id);
			}else{
				heap.taxID=TaxTree.parseHeaderStatic(r1.id);
//				System.err.println("B) "+heap.taxID+r1.id);
			}
		}
		assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn;
	}

	public void processRead(final Read r){
		if(amino){
			processReadAmino(r);
		}else if(translate){
			processReadTranslated(r);
		}else{
			processReadNucleotide(r);
		}
	}
	
	public void processReadTranslated(final Read r){
		assert(!r.aminoacid());
		final ArrayList<Read> prots;
		if(sixframes){
			if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
				Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also
				if(orf!=null && orf.length()>=min_SSU_len){
					assert(orf.is16S());
					if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
				}
				//TODO: Add 18S.
			}
			prots=TranslateSixFrames.toFrames(r, true, false, 6);
		}else{
			ArrayList<Orf> list;
			list=gCaller.callGenes(r);
			prots=CallGenes.translate(r, list);
			if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
				for(Orf orf : list){
					if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){
						heap.set16S(CallGenes.fetch(orf, r).bases);
						break;
					}
				}
			}
		}
		if(prots!=null){
			for(Read p : prots){
				processReadAmino(p);
			}
		}
	}
	
	void processReadNucleotide(final Read r){
		if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
			Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S
			if(orf!=null && orf.length()>=min_SSU_len){
				assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf;
				assert(orf.is16S());
				if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
			}
			//TODO: Add 18S.
		}
		
		final byte[] bases=r.bases;
		final byte[] quals=r.quality;
		final long[] baseCounts=heap.baseCounts(true);
		long kmer=0;
		long rkmer=0;
		int len=0;
		assert(!r.aminoacid());
		
		final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
		final long min=minHashValue;
		heap.genomeSizeBases+=r.length();
		heap.genomeSequences++;
		if(eTracker!=null){eTracker.clear();}
		
//		assert(false) : minProb+", "+minQual+", "+(quals==null);
		
		if(quals==null || (minProb<=0 && minQual<2)){
//			System.err.println("A");
			for(int i=0; i<bases.length; i++){
//				System.err.println("B: len="+len);
				byte b=bases[i];
				long x=AminoAcid.baseToNumber[b];
				long x2=AminoAcid.baseToComplementNumber[b];
				
				kmer=((kmer<<2)|x)&mask;
				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
				if(eTracker!=null){eTracker.add(b);}
				if(x<0){
					len=0;
					rkmer=0;
				}else{
					len++;
					baseCounts[(int)x]++;
				}
				
//				System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
//				+AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
//				+len+", "+(char)b+", "+x+", "+x2+"\n");
				
				if(len>=k){
					kmersProcessed++;
					heap.genomeSizeKmers++;
//					heap.probSum++; //Note really necessary for fasta data
					if(eTracker==null || eTracker.passes()){
//						System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
						
//						assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) : 
//							"\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
//							+AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
//							+len+", "+(char)b+", "+x+", "+x2;
						
						final long hashcode=hash(kmer, rkmer);
						//				System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
						if(hashcode>min){
							if(noBlacklist){
								heap.add(hashcode);
							}else{
								heap.checkAndAdd(hashcode);
							}
						}
					}else{
//						System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker);
//						assert(false);
					}
				}
			}
		}else{
			int zeroQualityKmers=0;
			int positiveQualityKmers=0;
			
			float prob=1;
			for(int i=0; i<bases.length; i++){
				final byte b=bases[i];
				final long x=AminoAcid.baseToNumber[b];
				final long x2=AminoAcid.baseToComplementNumber[b];

				kmer=((kmer<<2)|x)&mask;
				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
				if(eTracker!=null){eTracker.add(b);}

				final byte q=quals[i];
				{//Quality-related stuff
					assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual;
//					if(x>=0){
//						if(q>0){
//							positiveQualityBases++;
//						}else{
//							zeroQualityBases++;
//						}
//					}
					prob=prob*align2.QualityTools.PROB_CORRECT[q];
					if(len>k){
						byte oldq=quals[i-k];
						prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
					}
					if(x<0 || q<minQual){
						len=0;
						kmer=rkmer=0;
						prob=1;
					}else{
						len++;
						baseCounts[(int)x]++;
					}
				}
				
				if(len>=k){
					kmersProcessed++;
					if(prob>=minProb){
						heap.genomeSizeKmers++;
						heap.probSum+=prob;
						if(eTracker==null || eTracker.passes()){
							final long hashcode=hash(kmer, rkmer);
//							System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
							if(hashcode>min){
								if(noBlacklist){
									heap.add(hashcode);
								}else{
									heap.checkAndAdd(hashcode);
								}
							}
						}else{
//							System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
						}
						positiveQualityKmers++;
					}else if(q<=2){
						zeroQualityKmers++;
					}
				}
				
				//This version is slow but calculates depth better.
//				if(len>=k){
//					kmersProcessed++;
//					heap.genomeSizeKmers++;
//					final long hash=hash(kmer, rkmer);
//					//				System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
//					if(hash>min){
//						if(prob>=minProb || (!heap.setMode && heap.contains(hash))){
//							if(noBlacklist){
//								heap.add(hash);
//							}else{
//								heap.checkAndAdd(hash);
//							}
//						}
//					}
//				}
			}
			if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){
				if(looksLikePacBio(r)){
					synchronized(this){
						minProb=0;
					}
					processReadNucleotide(r);
				}
			}
		}
//		assert(false);
	}
	
	boolean looksLikePacBio(Read r){
		if(r.length()<302 || r.mate!=null){return false;}
		if(r.quality==null){
			int x=Parse.parseZmw(r.id);
			return x>=0;
		}
		int positive=0;
		int zero=0;
		int ns=0;
		for(int i=0; i<r.bases.length; i++){
			byte b=r.bases[i];
			byte q=r.quality[i];
			if(b=='N'){ns++;}
			else if(q==0 || q==2){
				zero++;
			}else{
				positive++;
			}
		}
		return zero>=r.length()/2 && positive==0;
	}

	void processReadAmino(final Read r){
		final byte[] bases=r.bases;
		long kmer=0;
		int len=0;
		assert(r.aminoacid());
		
		final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
		final long min=minHashValue;
		heap.genomeSizeBases+=r.length();
		heap.genomeSequences++;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			long x=AminoAcid.acidToNumberNoStops[b];
			kmer=((kmer<<aminoShift)|x)&mask;
//			if(eTracker!=null){eTracker.add(b);}
			
			if(x<0){len=0;}else{len++;}
			if(len>=k){
				kmersProcessed++;
				heap.genomeSizeKmers++;
//				if(eTracker==null || eTracker.passes()){
//					assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r;
					long hashcode=hash(kmer, kmer);
					if(hashcode>min){
						if(noBlacklist){
							heap.add(hashcode);
						}else{
							heap.checkAndAdd(hashcode);
						}
					}
//				}
			}
		}
	}

	void processReadAmino_old_no_entropy(final Read r){
		final byte[] bases=r.bases;
		long kmer=0;
		int len=0;
		assert(r.aminoacid());
		
		final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
		final long min=minHashValue;
		heap.genomeSizeBases+=r.length();
		heap.genomeSequences++;
		
		for(int i=0; i<bases.length; i++){
			byte b=bases[i];
			long x=AminoAcid.acidToNumberNoStops[b];
			kmer=((kmer<<aminoShift)|x)&mask;
			if(x<0){len=0;}else{len++;}
			if(len>=k){
				kmersProcessed++;
				heap.genomeSizeKmers++;
				long hashcode=hash(kmer, kmer);
				if(hashcode>min){
					if(noBlacklist){
						heap.add(hashcode);
					}else{
						heap.checkAndAdd(hashcode);
					}
				}
			}
		}
	}
	
	public Sketch toSketch(int minCount){
		Sketch sketch=null;
		if(heap!=null && heap.size()>0){
			try {
				sketch=new Sketch(heap, false, tool.trackCounts, null, minCount);
			} catch (Throwable e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
			heap.clear(false);
		}
		return sketch;
	}
	
	public void add(SketchMakerMini smm){
		heap.add(smm.heap);
		readsProcessed+=smm.readsProcessed;
		basesProcessed+=smm.basesProcessed;
		kmersProcessed+=smm.kmersProcessed;
		sketchesMade+=smm.sketchesMade;
		pacBioDetected|=smm.pacBioDetected;
	}

	/** True only if this thread has completed successfully */
	boolean success=false;

	SketchHeap heap;

	final int aminoShift;
	final int shift;
	final int shift2;
	final long mask;
	final EntropyTracker eTracker;
	final GeneCaller gCaller;
	
	public float minEntropy() {
		// TODO Auto-generated method stub
		return eTracker==null ? -1 : eTracker.cutoff();
	}
	
	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/

	/** Number of reads processed */
	protected long readsProcessed=0;
	/** Number of bases processed */
	protected long basesProcessed=0;
	/** Number of bases processed */
	protected long kmersProcessed=0;
	/** Number of sketches started */
	protected long sketchesMade=0;

	float minProb() {return minProb;}
	byte minQual() {return minQual;}
	public boolean pacBioDetected=false;
	private float minProb;
	private byte minQual;
	
	/*--------------------------------------------------------------*/
	/*----------------         Final Fields         ----------------*/
	/*--------------------------------------------------------------*/
	
	private final SketchTool tool;
	final int mode;
	
	/*--------------------------------------------------------------*/
	/*----------------        Common Fields         ----------------*/
	/*--------------------------------------------------------------*/
	
	/** Print status messages to this output stream */
	private PrintStream outstream=System.err;
	/** Print verbose messages */
	public static boolean verbose=false;
	/** True if an error was encountered */
	public boolean errorState=false;
	
}