jpayne@68: package sketch; jpayne@68: jpayne@68: import java.io.File; jpayne@68: import java.io.IOException; jpayne@68: import java.io.InputStream; jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.Collection; jpayne@68: import java.util.concurrent.ConcurrentLinkedQueue; jpayne@68: import java.util.concurrent.atomic.AtomicInteger; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.ByteStreamWriter; jpayne@68: import fileIO.FileFormat; jpayne@68: import fileIO.ReadWrite; jpayne@68: import kmer.HashArray1D; jpayne@68: import kmer.HashForest; jpayne@68: import kmer.KmerNode; jpayne@68: import kmer.KmerTableSet; jpayne@68: import shared.KillSwitch; jpayne@68: import shared.Parse; jpayne@68: import shared.Shared; jpayne@68: import shared.Tools; jpayne@68: import stream.ConcurrentReadInputStream; jpayne@68: import stream.Read; jpayne@68: import structures.ByteBuilder; jpayne@68: import structures.IntList; jpayne@68: import structures.ListNum; jpayne@68: import structures.LongList; jpayne@68: import structures.StringNum; jpayne@68: jpayne@68: /** jpayne@68: * @author Brian Bushnell jpayne@68: * @date June 28, 2016 jpayne@68: * jpayne@68: */ jpayne@68: public final class SketchTool extends SketchObject { jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Constructor ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public SketchTool(int size_, DisplayParams params){ jpayne@68: this(size_, params.minKeyOccuranceCount, params.trackCounts(), params.mergePairs); jpayne@68: } jpayne@68: jpayne@68: public SketchTool(int size_, int minKeyOccuranceCount_, boolean trackCounts_, boolean mergePairs_){ jpayne@68: stTargetSketchSize=size_; jpayne@68: minKeyOccuranceCount=minKeyOccuranceCount_; jpayne@68: trackCounts=trackCounts_; jpayne@68: mergePairs=mergePairs_; jpayne@68: jpayne@68: assert(!aminoOrTranslate() || !rcomp) : "rcomp should be false in amino mode."; jpayne@68: assert(!aminoOrTranslate() || (k*AminoAcid.AMINO_SHIFT<64)) : "Protein sketches require 1 <= K <= "+(63/AminoAcid.AMINO_SHIFT)+"."; jpayne@68: assert(k>0 && k<=32) : "Sketches require 1 <= K <= 32."; //123 jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Methods ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public Sketch toSketch(KmerTableSet tables, boolean multithreaded){ jpayne@68: final int threads=(multithreaded ? Tools.mid(1, Shared.threads(), tables.ways()) : 1); jpayne@68: return (threads<2 ? toSketch_ST(tables) : toSketch_MT(tables, threads)); jpayne@68: } jpayne@68: jpayne@68: private Sketch toSketch_ST(KmerTableSet tables){ jpayne@68: SketchHeap heap=(stTargetSketchSize>0 ? new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts) : null); jpayne@68: LongList list=new LongList(); jpayne@68: jpayne@68: KmerTableSet kts=(KmerTableSet)tables; jpayne@68: for(int tnum=0; tnum0){ jpayne@68: toHeap(table, heap); jpayne@68: }else{ jpayne@68: toList(table, list); jpayne@68: } jpayne@68: } jpayne@68: return stTargetSketchSize>0 ? new Sketch(heap, false, trackCounts, null) : toSketch(list);//TODO: Could add counts here jpayne@68: } jpayne@68: jpayne@68: private Sketch toSketch_MT(KmerTableSet tables, final int threads){ jpayne@68: ArrayList alst=new ArrayList(threads); jpayne@68: AtomicInteger ai=new AtomicInteger(0); jpayne@68: for(int i=0; i heaps=new ArrayList(threads); jpayne@68: LongList list=new LongList(); jpayne@68: jpayne@68: for(SketchThread pt : alst){ jpayne@68: jpayne@68: //Wait until this thread has terminated jpayne@68: while(pt.getState()!=Thread.State.TERMINATED){ jpayne@68: try { jpayne@68: //Attempt a join operation jpayne@68: pt.join(); jpayne@68: } catch (InterruptedException e) { jpayne@68: //Potentially handle this, if it is expected to occur jpayne@68: e.printStackTrace(); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(stTargetSketchSize>=0){ jpayne@68: if(pt.heap!=null && pt.heap.size()>0){ jpayne@68: heaps.add(pt.heap); jpayne@68: } jpayne@68: }else{ jpayne@68: if(pt.list!=null){list.append(pt.list);} jpayne@68: pt.list=null; jpayne@68: } jpayne@68: } jpayne@68: alst.clear(); jpayne@68: jpayne@68: return stTargetSketchSize>=0 ? toSketch(heaps, true) : toSketch(list);//TODO: Could add counts here jpayne@68: } jpayne@68: jpayne@68: public SketchHeap toHeap(HashArray1D table, SketchHeap heap){ jpayne@68: // if(heap==null){heap=new LongHeap(size, true);} jpayne@68: long[] kmers=table.array(); jpayne@68: int[] counts=table.values(); jpayne@68: for(int i=0; i=minKeyOccuranceCount){ jpayne@68: heap.genomeSizeKmers++; jpayne@68: long kmer=kmers[i]; jpayne@68: long hashcode=hash(kmer); jpayne@68: if(hashcode>=minHashValue){ jpayne@68: heap.add(hashcode); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: HashForest forest=table.victims(); jpayne@68: if(forest!=null){ jpayne@68: for(KmerNode kn : forest.array()){ jpayne@68: if(kn!=null){addRecursive(heap, kn);} jpayne@68: } jpayne@68: } jpayne@68: return heap; jpayne@68: } jpayne@68: jpayne@68: public LongList toList(HashArray1D table, LongList list){ jpayne@68: // if(heap==null){heap=new LongHeap(size, true);} jpayne@68: long[] kmers=table.array(); jpayne@68: int[] counts=table.values(); jpayne@68: for(int i=0; i=minKeyOccuranceCount){ jpayne@68: long kmer=kmers[i]; jpayne@68: long hashcode=hash(kmer); jpayne@68: if(hashcode>=minHashValue){ jpayne@68: list.add(hashcode); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: HashForest forest=table.victims(); jpayne@68: if(forest!=null){ jpayne@68: for(KmerNode kn : forest.array()){ jpayne@68: if(kn!=null){addRecursive(list, kn);} jpayne@68: } jpayne@68: } jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: // public long[] toSketchArray(ArrayList heaps){ jpayne@68: // if(heaps.size()==1){return toSketchArray(heaps.get(0));} jpayne@68: // LongList list=new LongList(size); jpayne@68: // for(LongHeap heap : heaps){ jpayne@68: // while(heap.size()>0){list.add(Long.MAX_VALUE-heap.poll());} jpayne@68: // } jpayne@68: // list.sort(); jpayne@68: // list.shrinkToUnique(); jpayne@68: // list.size=Tools.min(size, list.size); jpayne@68: // return list.toArray(); jpayne@68: // } jpayne@68: jpayne@68: public Sketch toSketch(ArrayList heaps, boolean allowZeroSizeSketch){ jpayne@68: if(heaps==null || heaps.isEmpty()){ jpayne@68: if(allowZeroSizeSketch){ jpayne@68: return new Sketch(new long[0], null, null, null, null, null); jpayne@68: }else{ jpayne@68: return null; jpayne@68: } jpayne@68: } jpayne@68: SketchHeap a=heaps.get(0); jpayne@68: for(int i=1; i=minHashValue) : list.size+", "+list.get(list.size()-1)+", "+minHashValue; jpayne@68: list.shrinkToUnique(); jpayne@68: list.reverse(); jpayne@68: for(int i=0; i=minKeyOccuranceCount){ jpayne@68: heap.genomeSizeKmers++; jpayne@68: long kmer=kn.pivot(); jpayne@68: long hashcode=hash(kmer); jpayne@68: if(hashcode>=minHashValue){heap.add(hashcode);} jpayne@68: } jpayne@68: if(kn.left()!=null){addRecursive(heap, kn.left());} jpayne@68: if(kn.right()!=null){addRecursive(heap, kn.right());} jpayne@68: } jpayne@68: jpayne@68: private void addRecursive(LongList list, KmerNode kn){ jpayne@68: if(kn==null){return;} jpayne@68: if(kn.count()>=minKeyOccuranceCount){ jpayne@68: long kmer=kn.pivot(); jpayne@68: long hashcode=hash(kmer); jpayne@68: if(hashcode>=minHashValue){list.add(hashcode);} jpayne@68: } jpayne@68: if(kn.left()!=null){addRecursive(list, kn.left());} jpayne@68: if(kn.right()!=null){addRecursive(list, kn.right());} jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- I/O ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: // public static ArrayList loadSketches_ST(String...fnames){ jpayne@68: // ArrayList sketches=null; jpayne@68: // for(String s : fnames){ jpayne@68: // ArrayList temp; jpayne@68: // if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){ jpayne@68: // temp=loadSketches(s); jpayne@68: // }else{ jpayne@68: // temp=loadSketches_ST(s.split(",")); jpayne@68: // } jpayne@68: // if(sketches==null){sketches=temp;} jpayne@68: // else{sketches.addAll(temp);} jpayne@68: // } jpayne@68: // return sketches; jpayne@68: // } jpayne@68: jpayne@68: // public static ArrayList loadSketches_MT(ArrayList fnames){ jpayne@68: // return loadSketches_MT(0, null, fnames.toArray(new String[0])); jpayne@68: // } jpayne@68: jpayne@68: public ArrayList loadSketches_MT(DisplayParams params, Collection fnames){ jpayne@68: return loadSketches_MT(params.mode, params.samplerate, params.maxReads, jpayne@68: params.minEntropy, params.minProb, params.minQual, fnames); jpayne@68: } jpayne@68: jpayne@68: public ArrayList loadSketches_MT(DisplayParams params, String...fnames){ jpayne@68: return loadSketches_MT(params.mode, params.samplerate, params.maxReads, jpayne@68: params.minEntropy, params.minProb, params.minQual, fnames); jpayne@68: } jpayne@68: jpayne@68: public ArrayList loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, Collection fnames){ jpayne@68: return loadSketches_MT(mode, samplerate, reads, minEntropy, minProb, minQual, fnames.toArray(new String[0])); jpayne@68: } jpayne@68: jpayne@68: //TODO: This is only multithreaded per file in persequence mode. jpayne@68: public ArrayList loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, String...fnames){ jpayne@68: jpayne@68: ConcurrentLinkedQueue decomposedFnames=new ConcurrentLinkedQueue(); jpayne@68: int num=0; jpayne@68: for(String s : fnames){ jpayne@68: if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){ jpayne@68: num++; jpayne@68: decomposedFnames.add(new StringNum(s, num)); jpayne@68: }else{ jpayne@68: for(String s2 : s.split(",")){ jpayne@68: num++; jpayne@68: decomposedFnames.add(new StringNum(s2, num)); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(decomposedFnames.size()==0){return null;} jpayne@68: if(decomposedFnames.size()==1){return loadSketchesFromFile(decomposedFnames.poll().s, null, 0, reads, mode, samplerate, minEntropy, minProb, minQual, false);} jpayne@68: jpayne@68: //Determine how many threads may be used jpayne@68: final int threads=Tools.min(Shared.threads(), decomposedFnames.size()); jpayne@68: jpayne@68: //Fill a list with LoadThreads jpayne@68: ArrayList allt=new ArrayList(threads); jpayne@68: jpayne@68: for(int i=0; i sketches=new ArrayList(); jpayne@68: jpayne@68: //Start the threads jpayne@68: for(LoadThread lt : allt){lt.start();} jpayne@68: jpayne@68: //Wait for completion of all threads jpayne@68: boolean success=true; jpayne@68: for(LoadThread lt : allt){ jpayne@68: jpayne@68: //Wait until this thread has terminated jpayne@68: while(lt.getState()!=Thread.State.TERMINATED){ jpayne@68: try { jpayne@68: //Attempt a join operation jpayne@68: lt.join(); jpayne@68: } catch (InterruptedException e) { jpayne@68: //Potentially handle this, if it is expected to occur jpayne@68: e.printStackTrace(); jpayne@68: } jpayne@68: } jpayne@68: sketches.addAll(lt.list); jpayne@68: success&=lt.success; jpayne@68: } jpayne@68: assert(success) : "Failure loading some files."; jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public ArrayList loadSketchesFromFile(final String fname0, SketchMakerMini smm, jpayne@68: int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){ jpayne@68: return loadSketchesFromFile(fname0, smm, maxThreads, reads, mode, jpayne@68: params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch); jpayne@68: } jpayne@68: jpayne@68: public ArrayList loadSketchesFromFile(final String fname0, SketchMakerMini smm, jpayne@68: int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ jpayne@68: assert(fname0!=null);//123 jpayne@68: if(fname0==null){return null;} jpayne@68: FileFormat ff=FileFormat.testInput(fname0, FileFormat.FASTA, null, false, true); jpayne@68: if(ff.isSequence()){ jpayne@68: return loadSketchesFromSequenceFile(ff, smm, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); jpayne@68: }else{ jpayne@68: return SketchObject.LOADER2 ? loadSketchesFromSketchFile2(ff, allowZeroSizeSketch) : loadSketchesFromSketchFile(ff, allowZeroSizeSketch); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: private ArrayList loadSketchesFromSequenceFile(final FileFormat ff, SketchMakerMini smm, jpayne@68: int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ jpayne@68: maxThreads=(maxThreads<1 ? Shared.threads() : Tools.min(maxThreads, Shared.threads())); jpayne@68: jpayne@68: // assert(false) : (ff.fasta() || ff.fastq() || ff.samOrBam())+", "+ff.fastq()+", "+maxThreads+", "+ jpayne@68: // allowMultithreadedFastq+", "+forceDisableMultithreadedFastq+", "+(mode==ONE_SKETCH); jpayne@68: jpayne@68: if(ff.fastq() && allowMultithreadedFastq && !forceDisableMultithreadedFastq && (mode==ONE_SKETCH || mode==PER_FILE) && jpayne@68: maxThreads>1 && Shared.threads()>2 && (reads<1 || reads*samplerate*(mergePairs ? 2 : 1)>=1000)){//limit is low due to PacBio long reads jpayne@68: jpayne@68: final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; jpayne@68: Read.VALIDATE_IN_CONSTRUCTOR=false; jpayne@68: jpayne@68: if(verbose2){System.err.println("Loading a sketch multithreaded.");} jpayne@68: Sketch sketch=processReadsMT(ff.name(), maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); jpayne@68: jpayne@68: Read.VALIDATE_IN_CONSTRUCTOR=vic; jpayne@68: jpayne@68: ArrayList list=new ArrayList(1); jpayne@68: if(sketch!=null && (sketch.length()>0 || allowZeroSizeSketch)){ jpayne@68: sketch.loadSSU(); jpayne@68: list.add(sketch); jpayne@68: } jpayne@68: return list; jpayne@68: } jpayne@68: if(smm==null){smm=new SketchMakerMini(this, mode, minEntropy, minProb, minQual);} jpayne@68: if(verbose2){System.err.println("Loading sketches via SMM.");} jpayne@68: ArrayList sketches=smm.toSketches(ff.name(), samplerate, reads); jpayne@68: if(verbose2){System.err.println("Loaded "+(sketches==null ? 0 : sketches.size())+" sketches via SMM.");} jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: private ArrayList loadSketchesFromSketchFile(final FileFormat ff, boolean allowZeroSizeSketch){ jpayne@68: jpayne@68: boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false; jpayne@68: jpayne@68: if(verbose2){System.err.println("Loading sketches from text.");} jpayne@68: ArrayList sketches=new ArrayList(); jpayne@68: ByteFile bf=ByteFile.makeByteFile(ff); jpayne@68: int currentSketchSize=stTargetSketchSize; jpayne@68: int taxID=-1; jpayne@68: long spid=-1; jpayne@68: long imgID=-1; jpayne@68: long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0; jpayne@68: long[] baseCounts=null; jpayne@68: byte[] r16S=null; jpayne@68: byte[] r18S=null; jpayne@68: int r16SLen=0; jpayne@68: int r18SLen=0; jpayne@68: float probCorrect=-1; jpayne@68: String name=null, name0=null, fname=ff.simpleName(); jpayne@68: LongList list=null; jpayne@68: IntList countList=null; jpayne@68: ArrayList meta=null; jpayne@68: long sum=0; jpayne@68: byte[] lastHeader=null; jpayne@68: jpayne@68: for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ jpayne@68: if(line.length>0){ jpayne@68: // System.err.println("Processing line "+new String(line)); jpayne@68: if(line[0]=='#'){ jpayne@68: if(r16SLen>0 || r18SLen>0){ jpayne@68: byte[] ssu=KillSwitch.copyOfRange(line, 5, line.length); jpayne@68: if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){ jpayne@68: assert(r16SLen>0); jpayne@68: assert(ssu.length==r16SLen) : r16SLen+", "+line.length+"\n"+new String(line)+"\n"; jpayne@68: r16S=ssu; jpayne@68: r16SLen=0; jpayne@68: }else if(Tools.startsWith(line, "#18S:")){ jpayne@68: assert(r18SLen>0); jpayne@68: assert(ssu.length==r18SLen) : r18SLen+", "+line.length+"\n"+new String(line)+"\n"; jpayne@68: r18S=ssu; jpayne@68: r18SLen=0; jpayne@68: }else{ jpayne@68: assert(false) : new String(line); jpayne@68: } jpayne@68: }else{ jpayne@68: lastHeader=line; jpayne@68: if(list!=null){ jpayne@68: assert(list.size==list.array.length); jpayne@68: if(NUC || unsorted){ jpayne@68: list.sort(); jpayne@68: list.shrinkToUnique(); jpayne@68: }else{ jpayne@68: list.shrink(); jpayne@68: } jpayne@68: if(list.size()>0 || allowZeroSizeSketch){ jpayne@68: int[] keyCounts=countList==null ? null : countList.array; jpayne@68: Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, jpayne@68: genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); jpayne@68: sketch.spid=spid; jpayne@68: sketches.add(sketch); jpayne@68: } jpayne@68: // System.err.println("Made sketch "+sketch); jpayne@68: } jpayne@68: name=name0=null; jpayne@68: fname=ff.simpleName(); jpayne@68: list=null; jpayne@68: countList=null; jpayne@68: meta=null; jpayne@68: baseCounts=null; jpayne@68: r16S=null; jpayne@68: r18S=null; jpayne@68: r16SLen=0; jpayne@68: r18SLen=0; jpayne@68: sum=0; jpayne@68: taxID=-1; jpayne@68: imgID=-1; jpayne@68: genomeSizeBases=0; jpayne@68: genomeSizeKmers=0; jpayne@68: genomeSequences=0; jpayne@68: probCorrect=-1; jpayne@68: int k_sketch=defaultK; jpayne@68: int k2_sketch=0; jpayne@68: int hashVersion_sketch=1; jpayne@68: jpayne@68: if(line.length>1){ jpayne@68: String[] split=new String(line, 1, line.length-1).split("\t"); jpayne@68: for(String s : split){ jpayne@68: final int colon=s.indexOf(':'); jpayne@68: final String sub=s.substring(colon+1); jpayne@68: if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length jpayne@68: currentSketchSize=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("CD:")){//Coding jpayne@68: A48=HEX=NUC=delta=counts=unsorted=false; jpayne@68: jpayne@68: for(int i=0; i=0){ jpayne@68: String[] subsplit=sub.split(","); jpayne@68: assert(subsplit.length==2) : "Bad header component "+s; jpayne@68: int x=Integer.parseInt(subsplit[0]); jpayne@68: int y=Integer.parseInt(subsplit[1]); jpayne@68: k_sketch=Tools.max(x, y); jpayne@68: k2_sketch=Tools.min(x, y); jpayne@68: }else{ jpayne@68: k_sketch=Integer.parseInt(sub); jpayne@68: k2_sketch=0; jpayne@68: } jpayne@68: }else if(s.startsWith("H:")){//Hash version jpayne@68: hashVersion_sketch=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN jpayne@68: baseCounts=Parse.parseLongArray(sub); jpayne@68: }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases jpayne@68: genomeSizeBases=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers jpayne@68: genomeSizeKmers=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GQ:")){ jpayne@68: genomeSequences=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GE:")){//Genome size estimate kmers jpayne@68: //ignore jpayne@68: }else if(s.startsWith("PC:")){//Probability of correctness jpayne@68: probCorrect=Float.parseFloat(sub); jpayne@68: }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){ jpayne@68: taxID=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("IMG:")){ jpayne@68: imgID=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("SPID:")){ jpayne@68: spid=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("NM:") || s.startsWith("NAME:")){ jpayne@68: name=sub; jpayne@68: }else if(s.startsWith("FN:")){ jpayne@68: fname=sub; jpayne@68: }else if(s.startsWith("NM0:")){ jpayne@68: name0=sub; jpayne@68: }else if(s.startsWith("MT_")){ jpayne@68: if(meta==null){meta=new ArrayList(1);} jpayne@68: meta.add(s.substring(3)); jpayne@68: }else if(s.startsWith("16S:") || s.startsWith("SSU:")){ jpayne@68: r16SLen=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("18S:")){ jpayne@68: r18SLen=Integer.parseInt(sub); jpayne@68: }else{ jpayne@68: assert(false) : "Unsupported header tag "+s; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(KILL_OK){ jpayne@68: if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+ jpayne@68: " differs from loaded kmer length "+k+"\n"+new String(line)+"\nfile: "+ff.name());} jpayne@68: if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+ jpayne@68: " differs from loaded kmer length "+k+","+k2+"\n"+new String(line)+"\nfile: "+ff.name());} jpayne@68: if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+ jpayne@68: " differs from loaded hash version "+HASH_VERSION+".\n" jpayne@68: + "You may need to download the latest version of BBTools.\n"+new String(line)+"\nfile: "+ff.name());} jpayne@68: }else{//Potential hang jpayne@68: assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line); jpayne@68: assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line); jpayne@68: assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+ jpayne@68: " differs from loaded hash version "+HASH_VERSION+".\n" jpayne@68: + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n"; jpayne@68: } jpayne@68: if(currentSketchSize>0 || allowZeroSizeSketch){ jpayne@68: list=new LongList(Tools.max(1, currentSketchSize)); jpayne@68: if(counts){countList=new IntList(Tools.max(1, currentSketchSize));} jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) : jpayne@68: HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Parse.parseLong(line)); jpayne@68: // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x)); jpayne@68: sum+=x; jpayne@68: assert(x>=0 || NUC) : "x="+x+"\nline="+new String(line)+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\nlineNum="+bf.lineNum()+"\n"; jpayne@68: assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it."; jpayne@68: assert(list!=null) : new String(line); jpayne@68: long key=(delta ? sum : x); jpayne@68: if(key>=0){list.add(key);} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(list!=null && (list.size>0 || allowZeroSizeSketch)){ jpayne@68: assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : list.size+"!="+list.array.length; jpayne@68: if(NUC || unsorted){ jpayne@68: list.sort(); jpayne@68: list.shrinkToUnique(); jpayne@68: }else{ jpayne@68: list.shrink(); jpayne@68: } jpayne@68: int[] keyCounts=countList==null ? null : countList.array; jpayne@68: Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, jpayne@68: genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); jpayne@68: sketch.spid=spid; jpayne@68: sketches.add(sketch); jpayne@68: } jpayne@68: if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");} jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: /** Usually much faster due to not manifesting the multithreaded load Java slowdown. Should incur less garbage collection also. */ jpayne@68: private ArrayList loadSketchesFromSketchFile2(final FileFormat ff, boolean allowZeroSizeSketch){ jpayne@68: jpayne@68: boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false; jpayne@68: jpayne@68: if(verbose2){System.err.println("Loading sketches from text.");} jpayne@68: ArrayList sketches=new ArrayList(); jpayne@68: jpayne@68: InputStream is=ReadWrite.getInputStream(ff.name(), BUFFERED_READER, false); jpayne@68: byte[] buffer=new byte[BUFLEN]; jpayne@68: int start, limit=0; jpayne@68: try { jpayne@68: limit=is.read(buffer); jpayne@68: } catch (IOException e) { jpayne@68: KillSwitch.exceptionKill(e); jpayne@68: } jpayne@68: jpayne@68: jpayne@68: int currentSketchSize=stTargetSketchSize; jpayne@68: int taxID=-1; jpayne@68: long spid=-1; jpayne@68: long imgID=-1; jpayne@68: long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0; jpayne@68: long[] baseCounts=null; jpayne@68: byte[] r16S=null; jpayne@68: byte[] r18S=null; jpayne@68: int r16SLen=0; jpayne@68: int r18SLen=0; jpayne@68: float probCorrect=-1; jpayne@68: String name=null, name0=null, fname=ff.simpleName(); jpayne@68: LongList list=null; jpayne@68: IntList countList=null; jpayne@68: ArrayList meta=null; jpayne@68: long sum=0; jpayne@68: byte[] lastHeader=null; jpayne@68: ByteBuilder bb=new ByteBuilder(256); jpayne@68: jpayne@68: for(start=0; start=limit){start=0; limit=is.read(buffer);} jpayne@68: } jpayne@68: } catch (IOException e) { jpayne@68: KillSwitch.exceptionKill(e); jpayne@68: } jpayne@68: start++; jpayne@68: jpayne@68: if(r16SLen>0 || r18SLen>0){ jpayne@68: byte[] ssu=bb.toBytes(5, bb.length()); jpayne@68: if(bb.startsWith("#16S:") || bb.startsWith("#SSU:")){ jpayne@68: assert(r16SLen>0); jpayne@68: assert(ssu.length==r16SLen) : r16SLen+", "+bb.length+"\n"+bb+"\n"; jpayne@68: r16S=ssu; jpayne@68: r16SLen=0; jpayne@68: }else if(bb.startsWith("#18S:")){ jpayne@68: assert(r18SLen>0); jpayne@68: assert(ssu.length==r18SLen) : r18SLen+", "+bb.length+"\n"+bb+"\n"; jpayne@68: r18S=ssu; jpayne@68: r18SLen=0; jpayne@68: }else{ jpayne@68: assert(false) : bb; jpayne@68: } jpayne@68: }else{ jpayne@68: jpayne@68: // byte[] line=lastHeader=bb.toBytes(); jpayne@68: if(list!=null){ jpayne@68: jpayne@68: //This assertion fails sometimes for Silva per-sequence mode, but it's not important jpayne@68: // assert(list.size==list.array.length) : list.size+", "+list.array.length+(lastHeader==null ? "" : ", "+new String(lastHeader)); jpayne@68: jpayne@68: if(NUC || unsorted){ jpayne@68: list.sort(); jpayne@68: list.shrinkToUnique(); jpayne@68: }else{ jpayne@68: list.shrink(); jpayne@68: } jpayne@68: if(list.size()>0 || allowZeroSizeSketch){ jpayne@68: int[] keyCounts=countList==null ? null : countList.array; jpayne@68: Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, jpayne@68: genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); jpayne@68: sketch.spid=spid; jpayne@68: sketches.add(sketch); jpayne@68: } jpayne@68: // System.err.println("Made sketch "+sketch); jpayne@68: } jpayne@68: name=name0=null; jpayne@68: fname=ff.simpleName(); jpayne@68: list=null; jpayne@68: countList=null; jpayne@68: meta=null; jpayne@68: baseCounts=null; jpayne@68: r16S=null; jpayne@68: r18S=null; jpayne@68: r16SLen=0; jpayne@68: r18SLen=0; jpayne@68: sum=0; jpayne@68: taxID=-1; jpayne@68: imgID=-1; jpayne@68: genomeSizeBases=0; jpayne@68: genomeSizeKmers=0; jpayne@68: genomeSequences=0; jpayne@68: probCorrect=-1; jpayne@68: int k_sketch=defaultK; jpayne@68: int k2_sketch=0; jpayne@68: int hashVersion_sketch=1; jpayne@68: jpayne@68: if(bb.length>1){ jpayne@68: String[] split=new String(bb.array, 1, bb.length-1).split("\t"); jpayne@68: for(String s : split){ jpayne@68: final int colon=s.indexOf(':'); jpayne@68: final String sub=s.substring(colon+1); jpayne@68: if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length jpayne@68: currentSketchSize=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("CD:")){//Coding jpayne@68: A48=HEX=NUC=delta=counts=unsorted=false; jpayne@68: jpayne@68: for(int i=0; i=0){ jpayne@68: String[] subsplit=sub.split(","); jpayne@68: assert(subsplit.length==2) : "Bad header component "+s; jpayne@68: int x=Integer.parseInt(subsplit[0]); jpayne@68: int y=Integer.parseInt(subsplit[1]); jpayne@68: k_sketch=Tools.max(x, y); jpayne@68: k2_sketch=Tools.min(x, y); jpayne@68: }else{ jpayne@68: k_sketch=Integer.parseInt(sub); jpayne@68: k2_sketch=0; jpayne@68: } jpayne@68: }else if(s.startsWith("H:")){//Hash version jpayne@68: hashVersion_sketch=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN jpayne@68: baseCounts=Parse.parseLongArray(sub); jpayne@68: }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases jpayne@68: genomeSizeBases=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers jpayne@68: genomeSizeKmers=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GQ:")){ jpayne@68: genomeSequences=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GE:")){//Genome size estimate kmers jpayne@68: //ignore jpayne@68: }else if(s.startsWith("PC:")){//Probability of correctness jpayne@68: probCorrect=Float.parseFloat(sub); jpayne@68: }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){ jpayne@68: taxID=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("IMG:")){ jpayne@68: imgID=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("SPID:")){ jpayne@68: spid=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("NM:") || s.startsWith("NAME:")){ jpayne@68: name=sub; jpayne@68: }else if(s.startsWith("FN:")){ jpayne@68: fname=sub; jpayne@68: }else if(s.startsWith("NM0:")){ jpayne@68: name0=sub; jpayne@68: }else if(s.startsWith("MT_")){ jpayne@68: if(meta==null){meta=new ArrayList(1);} jpayne@68: meta.add(s.substring(3)); jpayne@68: }else if(s.startsWith("16S:") || s.startsWith("SSU:")){ jpayne@68: r16SLen=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("18S:")){ jpayne@68: r18SLen=Integer.parseInt(sub); jpayne@68: }else{ jpayne@68: assert(false) : "Unsupported header tag "+s; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(KILL_OK){ jpayne@68: if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+ jpayne@68: " differs from loaded kmer length "+k+"\n"+bb+"\nfile: "+ff.name());} jpayne@68: if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+ jpayne@68: " differs from loaded kmer length "+k+","+k2+"\n"+bb+"\nfile: "+ff.name());} jpayne@68: if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+ jpayne@68: " differs from loaded hash version "+HASH_VERSION+".\n" jpayne@68: + "You may need to download the latest version of BBTools.\nfile: "+ff.name());} jpayne@68: }else{//Potential hang jpayne@68: assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+bb; jpayne@68: assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+bb; jpayne@68: assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+ jpayne@68: " differs from loaded hash version "+HASH_VERSION+".\n" jpayne@68: + "You may need to download the latest version of BBTools.\nfile: "+ff.name(); jpayne@68: } jpayne@68: if(currentSketchSize>0 || allowZeroSizeSketch){ jpayne@68: list=new LongList(Tools.max(1, currentSketchSize)); jpayne@68: if(counts){countList=new IntList(Tools.max(1, currentSketchSize));} jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: bb.clear(); jpayne@68: try { jpayne@68: while(buffer[start]!='\n'){ jpayne@68: bb.append(buffer[start]); jpayne@68: start++; jpayne@68: if(start>=limit){start=0; limit=is.read(buffer);} jpayne@68: } jpayne@68: } catch (IOException e) { jpayne@68: KillSwitch.exceptionKill(e); jpayne@68: } jpayne@68: start++; jpayne@68: jpayne@68: long x=(counts ? Sketch.parseA48C(bb, countList) : A48 ? Sketch.parseA48(bb) : jpayne@68: HEX ? Sketch.parseHex(bb) : NUC ? Sketch.parseNuc(bb) : Parse.parseLong(bb.array, 0, bb.length)); jpayne@68: // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x)); jpayne@68: sum+=x; jpayne@68: assert(x>=0 || NUC) : "x="+x+"\nline="+bb+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\n"; jpayne@68: assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it."; jpayne@68: assert(list!=null) : bb; jpayne@68: long key=(delta ? sum : x); jpayne@68: if(key>=0){list.add(key);} jpayne@68: } jpayne@68: jpayne@68: if(start>=limit){ jpayne@68: start=0; jpayne@68: try { jpayne@68: limit=is.read(buffer); jpayne@68: } catch (IOException e) { jpayne@68: KillSwitch.exceptionKill(e); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(list!=null && (list.size>0 || allowZeroSizeSketch)){ jpayne@68: jpayne@68: //This assertion fails sometimes for Silva per-sequence mode, but it's not important jpayne@68: // assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : jpayne@68: // list.size+"!="+list.array.length+(lastHeader==null ? "" : "\n"+new String(lastHeader)); jpayne@68: jpayne@68: if(NUC || unsorted){ jpayne@68: list.sort(); jpayne@68: list.shrinkToUnique(); jpayne@68: }else{ jpayne@68: list.shrink(); jpayne@68: } jpayne@68: int[] keyCounts=countList==null ? null : countList.array; jpayne@68: Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, jpayne@68: genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); jpayne@68: sketch.spid=spid; jpayne@68: sketches.add(sketch); jpayne@68: } jpayne@68: if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");} jpayne@68: try { jpayne@68: is.close(); jpayne@68: } catch (IOException e) { jpayne@68: e.printStackTrace(); jpayne@68: } jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public ArrayList loadSketchesFromString(String sketchString){ jpayne@68: boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false; jpayne@68: jpayne@68: ArrayList sketches=new ArrayList(); jpayne@68: int currentSketchSize=stTargetSketchSize; jpayne@68: int taxID=-1; jpayne@68: long spid=-1; jpayne@68: long imgID=-1; jpayne@68: long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0; jpayne@68: long[] baseCounts=null; jpayne@68: byte[] r16S=null; jpayne@68: byte[] r18S=null; jpayne@68: int r16SLen=0; jpayne@68: int r18SLen=0; jpayne@68: float probCorrect=-1; jpayne@68: String name=null, name0=null, fname=null; jpayne@68: LongList list=null; jpayne@68: IntList countList=null; jpayne@68: ArrayList meta=null; jpayne@68: long sum=0; jpayne@68: jpayne@68: String[] split0=sketchString.split("\n"); jpayne@68: for(String line : split0){ jpayne@68: if(line.length()>0){ jpayne@68: // System.err.println("Processing line "+new String(line)); jpayne@68: if(line.charAt(0)=='#'){ jpayne@68: if(line.length()>1 && line.charAt(1)=='#'){ jpayne@68: //ignore jpayne@68: }else if(r16SLen>0 || r18SLen>0){ jpayne@68: byte[] ssu=KillSwitch.copyOfRange(line.getBytes(), 5, line.length()); jpayne@68: if(line.startsWith("#16S:") || line.startsWith("#SSU:")){ jpayne@68: assert(r16SLen>0); jpayne@68: assert(ssu.length==r16SLen) : r16SLen+", "+line.length()+"\n"+line+"\n"; jpayne@68: r16S=ssu; jpayne@68: r16SLen=0; jpayne@68: }else if(line.startsWith("#18S:")){ jpayne@68: assert(r18SLen>0); jpayne@68: assert(ssu.length==r18SLen) : r18SLen+", "+line.length()+"\n"+line+"\n"; jpayne@68: r18S=ssu; jpayne@68: r18SLen=0; jpayne@68: }else{ jpayne@68: assert(false) : line; jpayne@68: } jpayne@68: }else{ jpayne@68: if(list!=null){ jpayne@68: assert(list.size==list.array.length); jpayne@68: if(NUC || unsorted){ jpayne@68: list.sort(); jpayne@68: list.shrinkToUnique(); jpayne@68: }else{ jpayne@68: list.shrink(); jpayne@68: } jpayne@68: if(list.size()>0){ jpayne@68: int[] keyCounts=countList==null ? null : countList.array; jpayne@68: Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, jpayne@68: genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); jpayne@68: sketch.spid=spid; jpayne@68: sketches.add(sketch); jpayne@68: } jpayne@68: // System.err.println("Made sketch "+sketch); jpayne@68: } jpayne@68: name=name0=null; jpayne@68: fname=null; jpayne@68: list=null; jpayne@68: countList=null; jpayne@68: meta=null; jpayne@68: baseCounts=null; jpayne@68: r16S=null; jpayne@68: r18S=null; jpayne@68: r16SLen=0; jpayne@68: r18SLen=0; jpayne@68: sum=0; jpayne@68: taxID=-1; jpayne@68: imgID=-1; jpayne@68: genomeSizeBases=0; jpayne@68: genomeSizeKmers=0; jpayne@68: genomeSequences=0; jpayne@68: probCorrect=-1; jpayne@68: int k_sketch=defaultK; jpayne@68: int k2_sketch=0; jpayne@68: int hashVersion_sketch=1; jpayne@68: jpayne@68: if(line.length()>1){ jpayne@68: String[] split=line.substring(1).split("\t"); jpayne@68: for(String s : split){ jpayne@68: final int colon=s.indexOf(':'); jpayne@68: final String sub=s.substring(colon+1); jpayne@68: if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length jpayne@68: currentSketchSize=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("CD:")){//Coding jpayne@68: A48=HEX=NUC=delta=counts=false; jpayne@68: jpayne@68: for(int i=0; i=0){ jpayne@68: String[] subsplit=sub.split(","); jpayne@68: assert(subsplit.length==2) : "Bad header component "+s; jpayne@68: int x=Integer.parseInt(subsplit[0]); jpayne@68: int y=Integer.parseInt(subsplit[1]); jpayne@68: k_sketch=Tools.max(x, y); jpayne@68: k2_sketch=Tools.min(x, y); jpayne@68: }else{ jpayne@68: k_sketch=Integer.parseInt(s); jpayne@68: k2_sketch=0; jpayne@68: } jpayne@68: }else if(s.startsWith("H:")){//Hash version jpayne@68: hashVersion_sketch=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN jpayne@68: baseCounts=Parse.parseLongArray(sub); jpayne@68: }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases jpayne@68: genomeSizeBases=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers jpayne@68: genomeSizeKmers=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GQ:")){ jpayne@68: genomeSequences=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("GE:")){//Genome size estimate kmers jpayne@68: //ignore jpayne@68: }else if(s.startsWith("PC:")){//Probability of correctness jpayne@68: probCorrect=Float.parseFloat(sub); jpayne@68: }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){ jpayne@68: taxID=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("IMG:")){ jpayne@68: imgID=Long.parseLong(sub); jpayne@68: }else if(s.startsWith("SPID:")){ jpayne@68: spid=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("NM:") || s.startsWith("NAME:")){ jpayne@68: name=sub; jpayne@68: }else if(s.startsWith("FN:")){ jpayne@68: fname=sub; jpayne@68: }else if(s.startsWith("NM0:")){ jpayne@68: name0=sub; jpayne@68: }else if(s.startsWith("MT_")){ jpayne@68: if(meta==null){meta=new ArrayList(1);} jpayne@68: meta.add(s.substring(3)); jpayne@68: }else if(s.startsWith("16S:") || s.startsWith("SSU:")){ jpayne@68: r16SLen=Integer.parseInt(sub); jpayne@68: }else if(s.startsWith("18S:")){ jpayne@68: r18SLen=Integer.parseInt(sub); jpayne@68: }else{ jpayne@68: assert(false) : "Unsupported header tag "+s; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: if(KILL_OK){ jpayne@68: if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line));} jpayne@68: if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line));} jpayne@68: if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+ jpayne@68: " differs from loaded hash version "+HASH_VERSION+".\n" jpayne@68: + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n");} jpayne@68: }else{//Potential hang jpayne@68: assert(k_sketch==k && !NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line); jpayne@68: assert(k2_sketch==k2 && !NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line); jpayne@68: assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+ jpayne@68: " differs from loaded hash version "+HASH_VERSION+".\n" jpayne@68: + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n"; jpayne@68: } jpayne@68: jpayne@68: jpayne@68: if(currentSketchSize>0){ jpayne@68: list=new LongList(currentSketchSize); jpayne@68: if(counts){countList=new IntList(currentSketchSize);} jpayne@68: } jpayne@68: } jpayne@68: }else{ jpayne@68: long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) : jpayne@68: HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Long.parseLong(line)); jpayne@68: // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x)); jpayne@68: sum+=x; jpayne@68: assert(x>=0 || NUC) : x+"\n"+new String(line); jpayne@68: assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it."; jpayne@68: long key=(delta ? sum : x); jpayne@68: if(key>=0){list.add(key);} jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(list!=null){ jpayne@68: assert(list.size==list.array.length || list.size()==0 || NUC || unsorted); jpayne@68: if(NUC || unsorted){ jpayne@68: list.sort(); jpayne@68: list.shrinkToUnique(); jpayne@68: }else{ jpayne@68: list.shrink(); jpayne@68: } jpayne@68: int[] keyCounts=countList==null ? null : countList.array; jpayne@68: Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, jpayne@68: genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); jpayne@68: sketch.spid=spid; jpayne@68: sketches.add(sketch); jpayne@68: } jpayne@68: return sketches; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Threads ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: jpayne@68: jpayne@68: private class LoadThread extends Thread{ jpayne@68: jpayne@68: public LoadThread(ConcurrentLinkedQueue queue_, int mode_, float samplerate_, long reads_, float minEntropy, float minProb, byte minQual) { jpayne@68: queue=queue_; jpayne@68: list=new ArrayList(); jpayne@68: smm=new SketchMakerMini(SketchTool.this, mode_, minEntropy, minProb, minQual); jpayne@68: samplerate=samplerate_; jpayne@68: reads=reads_; jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public void run(){ jpayne@68: success=false; jpayne@68: for(StringNum sn=queue.poll(); sn!=null; sn=queue.poll()){ jpayne@68: ArrayList temp=null; jpayne@68: try { jpayne@68: temp=loadSketchesFromFile(sn.s, smm, 1, reads, smm.mode, samplerate, smm.minEntropy(), smm.minProb(), smm.minQual(), false); jpayne@68: } catch (Throwable e) { jpayne@68: System.err.println("Failure loading "+sn+":\n"+e); jpayne@68: e.printStackTrace(); jpayne@68: success=false; jpayne@68: } jpayne@68: if(temp!=null && temp.size()>0){ jpayne@68: if(smm.mode==PER_FILE){ jpayne@68: // assert(temp.size()==1) : temp.size(); jpayne@68: temp.get(0).sketchID=(int)sn.n; jpayne@68: } jpayne@68: for(Sketch s : temp){add(s);} jpayne@68: } jpayne@68: } jpayne@68: success=true; jpayne@68: } jpayne@68: jpayne@68: private void add(Sketch s){ jpayne@68: if(list!=null){ jpayne@68: list.add(s); jpayne@68: return; jpayne@68: } jpayne@68: assert(false) : "Unsupported."; //The map logic is broken; needs to be synchronized. jpayne@68: // if(s.taxID<0){return;} jpayne@68: //// assert(s.taxID>-1) : s.toHeader(); jpayne@68: // TaxNode tn=tree.getNode(s.taxID); jpayne@68: // while(tn!=null && tn.pid!=tn.id && tn.level queue; jpayne@68: ArrayList list; jpayne@68: boolean success=false; jpayne@68: final SketchMakerMini smm; jpayne@68: final float samplerate; jpayne@68: final long reads; jpayne@68: jpayne@68: // ConcurrentHashMap map; jpayne@68: jpayne@68: } jpayne@68: jpayne@68: private class LoadThread2 extends Thread{ jpayne@68: jpayne@68: LoadThread2(ConcurrentReadInputStream cris_, float minEntropy, float minProb, byte minQual){ jpayne@68: cris=cris_; jpayne@68: smm=new SketchMakerMini(SketchTool.this, ONE_SKETCH, minEntropy, minProb, minQual); jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public void run(){ jpayne@68: jpayne@68: //Grab the first ListNum of reads jpayne@68: ListNum ln=cris.nextList(); jpayne@68: //Grab the actual read list from the ListNum jpayne@68: ArrayList reads=(ln!=null ? ln.list : null); jpayne@68: jpayne@68: //As long as there is a nonempty read list... jpayne@68: while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning jpayne@68: jpayne@68: //Loop through each read in the list jpayne@68: for(int idx=0; idx0){ jpayne@68: if(heap==null){heap=new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts);} jpayne@68: toHeap(table, heap); jpayne@68: }else{ jpayne@68: if(list==null){list=new LongList();} jpayne@68: toList(table, list); jpayne@68: } jpayne@68: tnum=next.getAndIncrement(); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: final AtomicInteger next; jpayne@68: final KmerTableSet kts; jpayne@68: SketchHeap heap; jpayne@68: LongList list; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Read Loading ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public Sketch processReadsMT(String fname, int maxThreads, long reads, int mode, float samplerate, jpayne@68: float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ jpayne@68: if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){ jpayne@68: return processReadsMT(fname.replaceFirst("#", "1"), fname.replaceFirst("#", "2"), maxThreads, reads, mode, samplerate, jpayne@68: minEntropy, minProb, minQual, allowZeroSizeSketch); jpayne@68: }else{ jpayne@68: return processReadsMT(fname, null, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: public Sketch processReadsMT(String fname1, String fname2, int maxThreads, long reads, int mode, float samplerate, jpayne@68: float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ jpayne@68: final FileFormat ffin1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true); jpayne@68: final FileFormat ffin2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true); jpayne@68: return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); jpayne@68: } jpayne@68: jpayne@68: public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){ jpayne@68: return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch); jpayne@68: } jpayne@68: jpayne@68: public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, float samplerate, jpayne@68: float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ jpayne@68: assert(mode==ONE_SKETCH || mode==PER_FILE); jpayne@68: final boolean compressed=ffin1.compressed(); jpayne@68: jpayne@68: maxThreads=Tools.mid(1, maxThreads, Shared.threads()); jpayne@68: jpayne@68: //Create a read input stream jpayne@68: final ConcurrentReadInputStream cris; jpayne@68: String simpleName; jpayne@68: { jpayne@68: simpleName=ffin1.simpleName(); jpayne@68: cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null); jpayne@68: if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);} jpayne@68: cris.start(); //Start the stream jpayne@68: // if(verbose){outstream.println("Started cris");} jpayne@68: } jpayne@68: jpayne@68: //TODO: bgzip actually decompresses fast. jpayne@68: final int threads=(int)Tools.min(maxThreads, jpayne@68: 1.75f*(compressed ? 1 : 2)*(ffin2==null ? 4 : 8)*(mergePairs ? 3 : minEntropy>0 ? 2 : 1)); jpayne@68: jpayne@68: if(verbose2){System.err.println("Starting "+threads+" load threads.");} jpayne@68: ArrayList list=new ArrayList(threads); jpayne@68: for(int i=0; i heaps=new ArrayList(threads); jpayne@68: jpayne@68: for(LoadThread2 pt : list){ jpayne@68: jpayne@68: //Wait until this thread has terminated jpayne@68: while(pt.getState()!=Thread.State.TERMINATED){ jpayne@68: try { jpayne@68: //Attempt a join operation jpayne@68: pt.join(); jpayne@68: } catch (InterruptedException e) { jpayne@68: //Potentially handle this, if it is expected to occur jpayne@68: e.printStackTrace(); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: if(pt.smm.heap!=null && pt.smm.heap.size()>0){ jpayne@68: heaps.add(pt.smm.heap); jpayne@68: } jpayne@68: } jpayne@68: list.clear(); jpayne@68: ReadWrite.closeStream(cris); jpayne@68: jpayne@68: if(verbose2){System.err.println("Generating a sketch by combining thread output.");} jpayne@68: Sketch sketch=toSketch(heaps, allowZeroSizeSketch); jpayne@68: if(verbose2){System.err.println("Resulting sketch: "+((sketch==null) ? "null" : "length="+sketch.length()));} jpayne@68: if(sketch!=null){sketch.setFname(simpleName);} jpayne@68: return sketch; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Writing ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static boolean write(ArrayList sketches, FileFormat ff[]){ jpayne@68: final int len=ff.length; jpayne@68: ByteStreamWriter tsw[]=new ByteStreamWriter[len]; jpayne@68: for(int i=0; i sketches, FileFormat ff){ jpayne@68: final ByteStreamWriter tsw=new ByteStreamWriter(ff); jpayne@68: final ByteBuilder bb=new ByteBuilder(); jpayne@68: tsw.start(); jpayne@68: for(Sketch sketch : sketches){ jpayne@68: write(sketch, tsw, bb); jpayne@68: } jpayne@68: return tsw.poisonAndWait(); jpayne@68: } jpayne@68: jpayne@68: public static boolean write(Sketch sketch, FileFormat ff){ jpayne@68: // System.err.println(ff.name()+", "+new File(ff.name()).exists()); jpayne@68: ByteStreamWriter tsw=new ByteStreamWriter(ff); jpayne@68: // assert(false) : new File(ff.name()).exists(); jpayne@68: tsw.start(); jpayne@68: write(sketch, tsw, null); jpayne@68: return tsw.poisonAndWait(); jpayne@68: } jpayne@68: jpayne@68: public static void write(Sketch sketch, ByteStreamWriter tsw, ByteBuilder bb){ jpayne@68: if(bb==null){bb=new ByteBuilder();} jpayne@68: else{bb.clear();} jpayne@68: sketch.toBytes(bb); jpayne@68: tsw.print(bb); jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: // final EntropyTracker eTracker; jpayne@68: final int stTargetSketchSize; jpayne@68: public final int minKeyOccuranceCount; jpayne@68: /** Force kmer counts to be tracked. */ jpayne@68: public final boolean trackCounts; jpayne@68: /** Merge reads before processing kmers. */ jpayne@68: public final boolean mergePairs; jpayne@68: jpayne@68: public static int BUFLEN=16384; jpayne@68: public static boolean BUFFERED_READER=false; jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Static Fields ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: // public static boolean verbose=false; jpayne@68: jpayne@68: }