annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchTool.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package sketch;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4 import java.io.IOException;
jpayne@68 5 import java.io.InputStream;
jpayne@68 6 import java.util.ArrayList;
jpayne@68 7 import java.util.Collection;
jpayne@68 8 import java.util.concurrent.ConcurrentLinkedQueue;
jpayne@68 9 import java.util.concurrent.atomic.AtomicInteger;
jpayne@68 10
jpayne@68 11 import dna.AminoAcid;
jpayne@68 12 import fileIO.ByteFile;
jpayne@68 13 import fileIO.ByteStreamWriter;
jpayne@68 14 import fileIO.FileFormat;
jpayne@68 15 import fileIO.ReadWrite;
jpayne@68 16 import kmer.HashArray1D;
jpayne@68 17 import kmer.HashForest;
jpayne@68 18 import kmer.KmerNode;
jpayne@68 19 import kmer.KmerTableSet;
jpayne@68 20 import shared.KillSwitch;
jpayne@68 21 import shared.Parse;
jpayne@68 22 import shared.Shared;
jpayne@68 23 import shared.Tools;
jpayne@68 24 import stream.ConcurrentReadInputStream;
jpayne@68 25 import stream.Read;
jpayne@68 26 import structures.ByteBuilder;
jpayne@68 27 import structures.IntList;
jpayne@68 28 import structures.ListNum;
jpayne@68 29 import structures.LongList;
jpayne@68 30 import structures.StringNum;
jpayne@68 31
jpayne@68 32 /**
jpayne@68 33 * @author Brian Bushnell
jpayne@68 34 * @date June 28, 2016
jpayne@68 35 *
jpayne@68 36 */
jpayne@68 37 public final class SketchTool extends SketchObject {
jpayne@68 38
jpayne@68 39 /*--------------------------------------------------------------*/
jpayne@68 40 /*---------------- Constructor ----------------*/
jpayne@68 41 /*--------------------------------------------------------------*/
jpayne@68 42
jpayne@68 43 public SketchTool(int size_, DisplayParams params){
jpayne@68 44 this(size_, params.minKeyOccuranceCount, params.trackCounts(), params.mergePairs);
jpayne@68 45 }
jpayne@68 46
jpayne@68 47 public SketchTool(int size_, int minKeyOccuranceCount_, boolean trackCounts_, boolean mergePairs_){
jpayne@68 48 stTargetSketchSize=size_;
jpayne@68 49 minKeyOccuranceCount=minKeyOccuranceCount_;
jpayne@68 50 trackCounts=trackCounts_;
jpayne@68 51 mergePairs=mergePairs_;
jpayne@68 52
jpayne@68 53 assert(!aminoOrTranslate() || !rcomp) : "rcomp should be false in amino mode.";
jpayne@68 54 assert(!aminoOrTranslate() || (k*AminoAcid.AMINO_SHIFT<64)) : "Protein sketches require 1 <= K <= "+(63/AminoAcid.AMINO_SHIFT)+".";
jpayne@68 55 assert(k>0 && k<=32) : "Sketches require 1 <= K <= 32."; //123
jpayne@68 56 }
jpayne@68 57
jpayne@68 58 /*--------------------------------------------------------------*/
jpayne@68 59 /*---------------- Methods ----------------*/
jpayne@68 60 /*--------------------------------------------------------------*/
jpayne@68 61
jpayne@68 62 public Sketch toSketch(KmerTableSet tables, boolean multithreaded){
jpayne@68 63 final int threads=(multithreaded ? Tools.mid(1, Shared.threads(), tables.ways()) : 1);
jpayne@68 64 return (threads<2 ? toSketch_ST(tables) : toSketch_MT(tables, threads));
jpayne@68 65 }
jpayne@68 66
jpayne@68 67 private Sketch toSketch_ST(KmerTableSet tables){
jpayne@68 68 SketchHeap heap=(stTargetSketchSize>0 ? new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts) : null);
jpayne@68 69 LongList list=new LongList();
jpayne@68 70
jpayne@68 71 KmerTableSet kts=(KmerTableSet)tables;
jpayne@68 72 for(int tnum=0; tnum<kts.ways; tnum++){
jpayne@68 73 HashArray1D table=kts.getTable(tnum);
jpayne@68 74 if(stTargetSketchSize>0){
jpayne@68 75 toHeap(table, heap);
jpayne@68 76 }else{
jpayne@68 77 toList(table, list);
jpayne@68 78 }
jpayne@68 79 }
jpayne@68 80 return stTargetSketchSize>0 ? new Sketch(heap, false, trackCounts, null) : toSketch(list);//TODO: Could add counts here
jpayne@68 81 }
jpayne@68 82
jpayne@68 83 private Sketch toSketch_MT(KmerTableSet tables, final int threads){
jpayne@68 84 ArrayList<SketchThread> alst=new ArrayList<SketchThread>(threads);
jpayne@68 85 AtomicInteger ai=new AtomicInteger(0);
jpayne@68 86 for(int i=0; i<threads; i++){
jpayne@68 87 alst.add(new SketchThread(ai, tables));
jpayne@68 88 }
jpayne@68 89
jpayne@68 90 //Start the threads
jpayne@68 91 for(SketchThread pt : alst){
jpayne@68 92 pt.start();
jpayne@68 93 }
jpayne@68 94
jpayne@68 95 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads);
jpayne@68 96 LongList list=new LongList();
jpayne@68 97
jpayne@68 98 for(SketchThread pt : alst){
jpayne@68 99
jpayne@68 100 //Wait until this thread has terminated
jpayne@68 101 while(pt.getState()!=Thread.State.TERMINATED){
jpayne@68 102 try {
jpayne@68 103 //Attempt a join operation
jpayne@68 104 pt.join();
jpayne@68 105 } catch (InterruptedException e) {
jpayne@68 106 //Potentially handle this, if it is expected to occur
jpayne@68 107 e.printStackTrace();
jpayne@68 108 }
jpayne@68 109 }
jpayne@68 110
jpayne@68 111 if(stTargetSketchSize>=0){
jpayne@68 112 if(pt.heap!=null && pt.heap.size()>0){
jpayne@68 113 heaps.add(pt.heap);
jpayne@68 114 }
jpayne@68 115 }else{
jpayne@68 116 if(pt.list!=null){list.append(pt.list);}
jpayne@68 117 pt.list=null;
jpayne@68 118 }
jpayne@68 119 }
jpayne@68 120 alst.clear();
jpayne@68 121
jpayne@68 122 return stTargetSketchSize>=0 ? toSketch(heaps, true) : toSketch(list);//TODO: Could add counts here
jpayne@68 123 }
jpayne@68 124
jpayne@68 125 public SketchHeap toHeap(HashArray1D table, SketchHeap heap){
jpayne@68 126 // if(heap==null){heap=new LongHeap(size, true);}
jpayne@68 127 long[] kmers=table.array();
jpayne@68 128 int[] counts=table.values();
jpayne@68 129 for(int i=0; i<table.arrayLength(); i++){
jpayne@68 130 int count=counts[i];
jpayne@68 131 if(count>=minKeyOccuranceCount){
jpayne@68 132 heap.genomeSizeKmers++;
jpayne@68 133 long kmer=kmers[i];
jpayne@68 134 long hashcode=hash(kmer);
jpayne@68 135 if(hashcode>=minHashValue){
jpayne@68 136 heap.add(hashcode);
jpayne@68 137 }
jpayne@68 138 }
jpayne@68 139 }
jpayne@68 140 HashForest forest=table.victims();
jpayne@68 141 if(forest!=null){
jpayne@68 142 for(KmerNode kn : forest.array()){
jpayne@68 143 if(kn!=null){addRecursive(heap, kn);}
jpayne@68 144 }
jpayne@68 145 }
jpayne@68 146 return heap;
jpayne@68 147 }
jpayne@68 148
jpayne@68 149 public LongList toList(HashArray1D table, LongList list){
jpayne@68 150 // if(heap==null){heap=new LongHeap(size, true);}
jpayne@68 151 long[] kmers=table.array();
jpayne@68 152 int[] counts=table.values();
jpayne@68 153 for(int i=0; i<table.arrayLength(); i++){
jpayne@68 154 int count=counts[i];
jpayne@68 155 if(count>=minKeyOccuranceCount){
jpayne@68 156 long kmer=kmers[i];
jpayne@68 157 long hashcode=hash(kmer);
jpayne@68 158 if(hashcode>=minHashValue){
jpayne@68 159 list.add(hashcode);
jpayne@68 160 }
jpayne@68 161 }
jpayne@68 162 }
jpayne@68 163 HashForest forest=table.victims();
jpayne@68 164 if(forest!=null){
jpayne@68 165 for(KmerNode kn : forest.array()){
jpayne@68 166 if(kn!=null){addRecursive(list, kn);}
jpayne@68 167 }
jpayne@68 168 }
jpayne@68 169 return list;
jpayne@68 170 }
jpayne@68 171
jpayne@68 172 // public long[] toSketchArray(ArrayList<LongHeap> heaps){
jpayne@68 173 // if(heaps.size()==1){return toSketchArray(heaps.get(0));}
jpayne@68 174 // LongList list=new LongList(size);
jpayne@68 175 // for(LongHeap heap : heaps){
jpayne@68 176 // while(heap.size()>0){list.add(Long.MAX_VALUE-heap.poll());}
jpayne@68 177 // }
jpayne@68 178 // list.sort();
jpayne@68 179 // list.shrinkToUnique();
jpayne@68 180 // list.size=Tools.min(size, list.size);
jpayne@68 181 // return list.toArray();
jpayne@68 182 // }
jpayne@68 183
jpayne@68 184 public Sketch toSketch(ArrayList<SketchHeap> heaps, boolean allowZeroSizeSketch){
jpayne@68 185 if(heaps==null || heaps.isEmpty()){
jpayne@68 186 if(allowZeroSizeSketch){
jpayne@68 187 return new Sketch(new long[0], null, null, null, null, null);
jpayne@68 188 }else{
jpayne@68 189 return null;
jpayne@68 190 }
jpayne@68 191 }
jpayne@68 192 SketchHeap a=heaps.get(0);
jpayne@68 193 for(int i=1; i<heaps.size(); i++){
jpayne@68 194 SketchHeap b=heaps.get(i);
jpayne@68 195 a.add(b);
jpayne@68 196 }
jpayne@68 197 if(verbose2){System.err.println("Creating a sketch of size "+a.size()+".");}
jpayne@68 198 return new Sketch(a, false, trackCounts, null);
jpayne@68 199 }
jpayne@68 200
jpayne@68 201 Sketch toSketch(LongList list){
jpayne@68 202 list.sort();
jpayne@68 203 assert(list.size==0 || list.get(list.size()-1)>=minHashValue) : list.size+", "+list.get(list.size()-1)+", "+minHashValue;
jpayne@68 204 list.shrinkToUnique();
jpayne@68 205 list.reverse();
jpayne@68 206 for(int i=0; i<list.size; i++){list.array[i]=Long.MAX_VALUE-list.array[i];}
jpayne@68 207 return new Sketch(list.toArray(), null, null, null, null, null);
jpayne@68 208 }
jpayne@68 209
jpayne@68 210 /*--------------------------------------------------------------*/
jpayne@68 211 /*---------------- Helpers ----------------*/
jpayne@68 212 /*--------------------------------------------------------------*/
jpayne@68 213
jpayne@68 214 private void addRecursive(SketchHeap heap, KmerNode kn){
jpayne@68 215 if(kn==null){return;}
jpayne@68 216 if(kn.count()>=minKeyOccuranceCount){
jpayne@68 217 heap.genomeSizeKmers++;
jpayne@68 218 long kmer=kn.pivot();
jpayne@68 219 long hashcode=hash(kmer);
jpayne@68 220 if(hashcode>=minHashValue){heap.add(hashcode);}
jpayne@68 221 }
jpayne@68 222 if(kn.left()!=null){addRecursive(heap, kn.left());}
jpayne@68 223 if(kn.right()!=null){addRecursive(heap, kn.right());}
jpayne@68 224 }
jpayne@68 225
jpayne@68 226 private void addRecursive(LongList list, KmerNode kn){
jpayne@68 227 if(kn==null){return;}
jpayne@68 228 if(kn.count()>=minKeyOccuranceCount){
jpayne@68 229 long kmer=kn.pivot();
jpayne@68 230 long hashcode=hash(kmer);
jpayne@68 231 if(hashcode>=minHashValue){list.add(hashcode);}
jpayne@68 232 }
jpayne@68 233 if(kn.left()!=null){addRecursive(list, kn.left());}
jpayne@68 234 if(kn.right()!=null){addRecursive(list, kn.right());}
jpayne@68 235 }
jpayne@68 236
jpayne@68 237 /*--------------------------------------------------------------*/
jpayne@68 238 /*---------------- I/O ----------------*/
jpayne@68 239 /*--------------------------------------------------------------*/
jpayne@68 240
jpayne@68 241 // public static ArrayList<Sketch> loadSketches_ST(String...fnames){
jpayne@68 242 // ArrayList<Sketch> sketches=null;
jpayne@68 243 // for(String s : fnames){
jpayne@68 244 // ArrayList<Sketch> temp;
jpayne@68 245 // if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){
jpayne@68 246 // temp=loadSketches(s);
jpayne@68 247 // }else{
jpayne@68 248 // temp=loadSketches_ST(s.split(","));
jpayne@68 249 // }
jpayne@68 250 // if(sketches==null){sketches=temp;}
jpayne@68 251 // else{sketches.addAll(temp);}
jpayne@68 252 // }
jpayne@68 253 // return sketches;
jpayne@68 254 // }
jpayne@68 255
jpayne@68 256 // public static ArrayList<Sketch> loadSketches_MT(ArrayList<String> fnames){
jpayne@68 257 // return loadSketches_MT(0, null, fnames.toArray(new String[0]));
jpayne@68 258 // }
jpayne@68 259
jpayne@68 260 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, Collection<String> fnames){
jpayne@68 261 return loadSketches_MT(params.mode, params.samplerate, params.maxReads,
jpayne@68 262 params.minEntropy, params.minProb, params.minQual, fnames);
jpayne@68 263 }
jpayne@68 264
jpayne@68 265 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, String...fnames){
jpayne@68 266 return loadSketches_MT(params.mode, params.samplerate, params.maxReads,
jpayne@68 267 params.minEntropy, params.minProb, params.minQual, fnames);
jpayne@68 268 }
jpayne@68 269
jpayne@68 270 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, Collection<String> fnames){
jpayne@68 271 return loadSketches_MT(mode, samplerate, reads, minEntropy, minProb, minQual, fnames.toArray(new String[0]));
jpayne@68 272 }
jpayne@68 273
jpayne@68 274 //TODO: This is only multithreaded per file in persequence mode.
jpayne@68 275 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, String...fnames){
jpayne@68 276
jpayne@68 277 ConcurrentLinkedQueue<StringNum> decomposedFnames=new ConcurrentLinkedQueue<StringNum>();
jpayne@68 278 int num=0;
jpayne@68 279 for(String s : fnames){
jpayne@68 280 if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){
jpayne@68 281 num++;
jpayne@68 282 decomposedFnames.add(new StringNum(s, num));
jpayne@68 283 }else{
jpayne@68 284 for(String s2 : s.split(",")){
jpayne@68 285 num++;
jpayne@68 286 decomposedFnames.add(new StringNum(s2, num));
jpayne@68 287 }
jpayne@68 288 }
jpayne@68 289 }
jpayne@68 290
jpayne@68 291 if(decomposedFnames.size()==0){return null;}
jpayne@68 292 if(decomposedFnames.size()==1){return loadSketchesFromFile(decomposedFnames.poll().s, null, 0, reads, mode, samplerate, minEntropy, minProb, minQual, false);}
jpayne@68 293
jpayne@68 294 //Determine how many threads may be used
jpayne@68 295 final int threads=Tools.min(Shared.threads(), decomposedFnames.size());
jpayne@68 296
jpayne@68 297 //Fill a list with LoadThreads
jpayne@68 298 ArrayList<LoadThread> allt=new ArrayList<LoadThread>(threads);
jpayne@68 299
jpayne@68 300 for(int i=0; i<threads; i++){
jpayne@68 301 allt.add(new LoadThread(decomposedFnames, mode, samplerate, reads, minEntropy, minProb, minQual));
jpayne@68 302 }
jpayne@68 303
jpayne@68 304 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
jpayne@68 305
jpayne@68 306 //Start the threads
jpayne@68 307 for(LoadThread lt : allt){lt.start();}
jpayne@68 308
jpayne@68 309 //Wait for completion of all threads
jpayne@68 310 boolean success=true;
jpayne@68 311 for(LoadThread lt : allt){
jpayne@68 312
jpayne@68 313 //Wait until this thread has terminated
jpayne@68 314 while(lt.getState()!=Thread.State.TERMINATED){
jpayne@68 315 try {
jpayne@68 316 //Attempt a join operation
jpayne@68 317 lt.join();
jpayne@68 318 } catch (InterruptedException e) {
jpayne@68 319 //Potentially handle this, if it is expected to occur
jpayne@68 320 e.printStackTrace();
jpayne@68 321 }
jpayne@68 322 }
jpayne@68 323 sketches.addAll(lt.list);
jpayne@68 324 success&=lt.success;
jpayne@68 325 }
jpayne@68 326 assert(success) : "Failure loading some files.";
jpayne@68 327 return sketches;
jpayne@68 328 }
jpayne@68 329
jpayne@68 330 /*--------------------------------------------------------------*/
jpayne@68 331
jpayne@68 332 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm,
jpayne@68 333 int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){
jpayne@68 334 return loadSketchesFromFile(fname0, smm, maxThreads, reads, mode,
jpayne@68 335 params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch);
jpayne@68 336 }
jpayne@68 337
jpayne@68 338 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm,
jpayne@68 339 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
jpayne@68 340 assert(fname0!=null);//123
jpayne@68 341 if(fname0==null){return null;}
jpayne@68 342 FileFormat ff=FileFormat.testInput(fname0, FileFormat.FASTA, null, false, true);
jpayne@68 343 if(ff.isSequence()){
jpayne@68 344 return loadSketchesFromSequenceFile(ff, smm, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
jpayne@68 345 }else{
jpayne@68 346 return SketchObject.LOADER2 ? loadSketchesFromSketchFile2(ff, allowZeroSizeSketch) : loadSketchesFromSketchFile(ff, allowZeroSizeSketch);
jpayne@68 347 }
jpayne@68 348 }
jpayne@68 349
jpayne@68 350 private ArrayList<Sketch> loadSketchesFromSequenceFile(final FileFormat ff, SketchMakerMini smm,
jpayne@68 351 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
jpayne@68 352 maxThreads=(maxThreads<1 ? Shared.threads() : Tools.min(maxThreads, Shared.threads()));
jpayne@68 353
jpayne@68 354 // assert(false) : (ff.fasta() || ff.fastq() || ff.samOrBam())+", "+ff.fastq()+", "+maxThreads+", "+
jpayne@68 355 // allowMultithreadedFastq+", "+forceDisableMultithreadedFastq+", "+(mode==ONE_SKETCH);
jpayne@68 356
jpayne@68 357 if(ff.fastq() && allowMultithreadedFastq && !forceDisableMultithreadedFastq && (mode==ONE_SKETCH || mode==PER_FILE) &&
jpayne@68 358 maxThreads>1 && Shared.threads()>2 && (reads<1 || reads*samplerate*(mergePairs ? 2 : 1)>=1000)){//limit is low due to PacBio long reads
jpayne@68 359
jpayne@68 360 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
jpayne@68 361 Read.VALIDATE_IN_CONSTRUCTOR=false;
jpayne@68 362
jpayne@68 363 if(verbose2){System.err.println("Loading a sketch multithreaded.");}
jpayne@68 364 Sketch sketch=processReadsMT(ff.name(), maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
jpayne@68 365
jpayne@68 366 Read.VALIDATE_IN_CONSTRUCTOR=vic;
jpayne@68 367
jpayne@68 368 ArrayList<Sketch> list=new ArrayList<Sketch>(1);
jpayne@68 369 if(sketch!=null && (sketch.length()>0 || allowZeroSizeSketch)){
jpayne@68 370 sketch.loadSSU();
jpayne@68 371 list.add(sketch);
jpayne@68 372 }
jpayne@68 373 return list;
jpayne@68 374 }
jpayne@68 375 if(smm==null){smm=new SketchMakerMini(this, mode, minEntropy, minProb, minQual);}
jpayne@68 376 if(verbose2){System.err.println("Loading sketches via SMM.");}
jpayne@68 377 ArrayList<Sketch> sketches=smm.toSketches(ff.name(), samplerate, reads);
jpayne@68 378 if(verbose2){System.err.println("Loaded "+(sketches==null ? 0 : sketches.size())+" sketches via SMM.");}
jpayne@68 379 return sketches;
jpayne@68 380 }
jpayne@68 381
jpayne@68 382 private ArrayList<Sketch> loadSketchesFromSketchFile(final FileFormat ff, boolean allowZeroSizeSketch){
jpayne@68 383
jpayne@68 384 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
jpayne@68 385
jpayne@68 386 if(verbose2){System.err.println("Loading sketches from text.");}
jpayne@68 387 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
jpayne@68 388 ByteFile bf=ByteFile.makeByteFile(ff);
jpayne@68 389 int currentSketchSize=stTargetSketchSize;
jpayne@68 390 int taxID=-1;
jpayne@68 391 long spid=-1;
jpayne@68 392 long imgID=-1;
jpayne@68 393 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
jpayne@68 394 long[] baseCounts=null;
jpayne@68 395 byte[] r16S=null;
jpayne@68 396 byte[] r18S=null;
jpayne@68 397 int r16SLen=0;
jpayne@68 398 int r18SLen=0;
jpayne@68 399 float probCorrect=-1;
jpayne@68 400 String name=null, name0=null, fname=ff.simpleName();
jpayne@68 401 LongList list=null;
jpayne@68 402 IntList countList=null;
jpayne@68 403 ArrayList<String> meta=null;
jpayne@68 404 long sum=0;
jpayne@68 405 byte[] lastHeader=null;
jpayne@68 406
jpayne@68 407 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
jpayne@68 408 if(line.length>0){
jpayne@68 409 // System.err.println("Processing line "+new String(line));
jpayne@68 410 if(line[0]=='#'){
jpayne@68 411 if(r16SLen>0 || r18SLen>0){
jpayne@68 412 byte[] ssu=KillSwitch.copyOfRange(line, 5, line.length);
jpayne@68 413 if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){
jpayne@68 414 assert(r16SLen>0);
jpayne@68 415 assert(ssu.length==r16SLen) : r16SLen+", "+line.length+"\n"+new String(line)+"\n";
jpayne@68 416 r16S=ssu;
jpayne@68 417 r16SLen=0;
jpayne@68 418 }else if(Tools.startsWith(line, "#18S:")){
jpayne@68 419 assert(r18SLen>0);
jpayne@68 420 assert(ssu.length==r18SLen) : r18SLen+", "+line.length+"\n"+new String(line)+"\n";
jpayne@68 421 r18S=ssu;
jpayne@68 422 r18SLen=0;
jpayne@68 423 }else{
jpayne@68 424 assert(false) : new String(line);
jpayne@68 425 }
jpayne@68 426 }else{
jpayne@68 427 lastHeader=line;
jpayne@68 428 if(list!=null){
jpayne@68 429 assert(list.size==list.array.length);
jpayne@68 430 if(NUC || unsorted){
jpayne@68 431 list.sort();
jpayne@68 432 list.shrinkToUnique();
jpayne@68 433 }else{
jpayne@68 434 list.shrink();
jpayne@68 435 }
jpayne@68 436 if(list.size()>0 || allowZeroSizeSketch){
jpayne@68 437 int[] keyCounts=countList==null ? null : countList.array;
jpayne@68 438 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
jpayne@68 439 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
jpayne@68 440 sketch.spid=spid;
jpayne@68 441 sketches.add(sketch);
jpayne@68 442 }
jpayne@68 443 // System.err.println("Made sketch "+sketch);
jpayne@68 444 }
jpayne@68 445 name=name0=null;
jpayne@68 446 fname=ff.simpleName();
jpayne@68 447 list=null;
jpayne@68 448 countList=null;
jpayne@68 449 meta=null;
jpayne@68 450 baseCounts=null;
jpayne@68 451 r16S=null;
jpayne@68 452 r18S=null;
jpayne@68 453 r16SLen=0;
jpayne@68 454 r18SLen=0;
jpayne@68 455 sum=0;
jpayne@68 456 taxID=-1;
jpayne@68 457 imgID=-1;
jpayne@68 458 genomeSizeBases=0;
jpayne@68 459 genomeSizeKmers=0;
jpayne@68 460 genomeSequences=0;
jpayne@68 461 probCorrect=-1;
jpayne@68 462 int k_sketch=defaultK;
jpayne@68 463 int k2_sketch=0;
jpayne@68 464 int hashVersion_sketch=1;
jpayne@68 465
jpayne@68 466 if(line.length>1){
jpayne@68 467 String[] split=new String(line, 1, line.length-1).split("\t");
jpayne@68 468 for(String s : split){
jpayne@68 469 final int colon=s.indexOf(':');
jpayne@68 470 final String sub=s.substring(colon+1);
jpayne@68 471 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
jpayne@68 472 currentSketchSize=Integer.parseInt(sub);
jpayne@68 473 }else if(s.startsWith("CD:")){//Coding
jpayne@68 474 A48=HEX=NUC=delta=counts=unsorted=false;
jpayne@68 475
jpayne@68 476 for(int i=0; i<sub.length(); i++){
jpayne@68 477 char c=sub.charAt(i);
jpayne@68 478 if(c=='A'){A48=true;}
jpayne@68 479 else if(c=='H'){HEX=true;}
jpayne@68 480 else if(c=='R'){A48=HEX=false;}
jpayne@68 481 else if(c=='N'){NUC=true;}
jpayne@68 482 else if(c=='D'){delta=true;}
jpayne@68 483 else if(c=='C'){counts=true;}
jpayne@68 484 else if(c=='U'){unsorted=true;}
jpayne@68 485 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);}
jpayne@68 486 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);}
jpayne@68 487 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);}
jpayne@68 488 }
jpayne@68 489 }else if(s.startsWith("K:")){//Kmer length
jpayne@68 490 if(sub.indexOf(',')>=0){
jpayne@68 491 String[] subsplit=sub.split(",");
jpayne@68 492 assert(subsplit.length==2) : "Bad header component "+s;
jpayne@68 493 int x=Integer.parseInt(subsplit[0]);
jpayne@68 494 int y=Integer.parseInt(subsplit[1]);
jpayne@68 495 k_sketch=Tools.max(x, y);
jpayne@68 496 k2_sketch=Tools.min(x, y);
jpayne@68 497 }else{
jpayne@68 498 k_sketch=Integer.parseInt(sub);
jpayne@68 499 k2_sketch=0;
jpayne@68 500 }
jpayne@68 501 }else if(s.startsWith("H:")){//Hash version
jpayne@68 502 hashVersion_sketch=Integer.parseInt(sub);
jpayne@68 503 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
jpayne@68 504 baseCounts=Parse.parseLongArray(sub);
jpayne@68 505 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
jpayne@68 506 genomeSizeBases=Long.parseLong(sub);
jpayne@68 507 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
jpayne@68 508 genomeSizeKmers=Long.parseLong(sub);
jpayne@68 509 }else if(s.startsWith("GQ:")){
jpayne@68 510 genomeSequences=Long.parseLong(sub);
jpayne@68 511 }else if(s.startsWith("GE:")){//Genome size estimate kmers
jpayne@68 512 //ignore
jpayne@68 513 }else if(s.startsWith("PC:")){//Probability of correctness
jpayne@68 514 probCorrect=Float.parseFloat(sub);
jpayne@68 515 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
jpayne@68 516 taxID=Integer.parseInt(sub);
jpayne@68 517 }else if(s.startsWith("IMG:")){
jpayne@68 518 imgID=Long.parseLong(sub);
jpayne@68 519 }else if(s.startsWith("SPID:")){
jpayne@68 520 spid=Integer.parseInt(sub);
jpayne@68 521 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
jpayne@68 522 name=sub;
jpayne@68 523 }else if(s.startsWith("FN:")){
jpayne@68 524 fname=sub;
jpayne@68 525 }else if(s.startsWith("NM0:")){
jpayne@68 526 name0=sub;
jpayne@68 527 }else if(s.startsWith("MT_")){
jpayne@68 528 if(meta==null){meta=new ArrayList<String>(1);}
jpayne@68 529 meta.add(s.substring(3));
jpayne@68 530 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
jpayne@68 531 r16SLen=Integer.parseInt(sub);
jpayne@68 532 }else if(s.startsWith("18S:")){
jpayne@68 533 r18SLen=Integer.parseInt(sub);
jpayne@68 534 }else{
jpayne@68 535 assert(false) : "Unsupported header tag "+s;
jpayne@68 536 }
jpayne@68 537 }
jpayne@68 538 }
jpayne@68 539
jpayne@68 540 if(KILL_OK){
jpayne@68 541 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+
jpayne@68 542 " differs from loaded kmer length "+k+"\n"+new String(line)+"\nfile: "+ff.name());}
jpayne@68 543 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+
jpayne@68 544 " differs from loaded kmer length "+k+","+k2+"\n"+new String(line)+"\nfile: "+ff.name());}
jpayne@68 545 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
jpayne@68 546 " differs from loaded hash version "+HASH_VERSION+".\n"
jpayne@68 547 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\nfile: "+ff.name());}
jpayne@68 548 }else{//Potential hang
jpayne@68 549 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line);
jpayne@68 550 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line);
jpayne@68 551 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
jpayne@68 552 " differs from loaded hash version "+HASH_VERSION+".\n"
jpayne@68 553 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n";
jpayne@68 554 }
jpayne@68 555 if(currentSketchSize>0 || allowZeroSizeSketch){
jpayne@68 556 list=new LongList(Tools.max(1, currentSketchSize));
jpayne@68 557 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));}
jpayne@68 558 }
jpayne@68 559 }
jpayne@68 560 }else{
jpayne@68 561 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) :
jpayne@68 562 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Parse.parseLong(line));
jpayne@68 563 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
jpayne@68 564 sum+=x;
jpayne@68 565 assert(x>=0 || NUC) : "x="+x+"\nline="+new String(line)+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\nlineNum="+bf.lineNum()+"\n";
jpayne@68 566 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
jpayne@68 567 assert(list!=null) : new String(line);
jpayne@68 568 long key=(delta ? sum : x);
jpayne@68 569 if(key>=0){list.add(key);}
jpayne@68 570 }
jpayne@68 571 }
jpayne@68 572 }
jpayne@68 573 if(list!=null && (list.size>0 || allowZeroSizeSketch)){
jpayne@68 574 assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : list.size+"!="+list.array.length;
jpayne@68 575 if(NUC || unsorted){
jpayne@68 576 list.sort();
jpayne@68 577 list.shrinkToUnique();
jpayne@68 578 }else{
jpayne@68 579 list.shrink();
jpayne@68 580 }
jpayne@68 581 int[] keyCounts=countList==null ? null : countList.array;
jpayne@68 582 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
jpayne@68 583 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
jpayne@68 584 sketch.spid=spid;
jpayne@68 585 sketches.add(sketch);
jpayne@68 586 }
jpayne@68 587 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");}
jpayne@68 588 return sketches;
jpayne@68 589 }
jpayne@68 590
jpayne@68 591 /** Usually much faster due to not manifesting the multithreaded load Java slowdown. Should incur less garbage collection also. */
jpayne@68 592 private ArrayList<Sketch> loadSketchesFromSketchFile2(final FileFormat ff, boolean allowZeroSizeSketch){
jpayne@68 593
jpayne@68 594 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
jpayne@68 595
jpayne@68 596 if(verbose2){System.err.println("Loading sketches from text.");}
jpayne@68 597 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
jpayne@68 598
jpayne@68 599 InputStream is=ReadWrite.getInputStream(ff.name(), BUFFERED_READER, false);
jpayne@68 600 byte[] buffer=new byte[BUFLEN];
jpayne@68 601 int start, limit=0;
jpayne@68 602 try {
jpayne@68 603 limit=is.read(buffer);
jpayne@68 604 } catch (IOException e) {
jpayne@68 605 KillSwitch.exceptionKill(e);
jpayne@68 606 }
jpayne@68 607
jpayne@68 608
jpayne@68 609 int currentSketchSize=stTargetSketchSize;
jpayne@68 610 int taxID=-1;
jpayne@68 611 long spid=-1;
jpayne@68 612 long imgID=-1;
jpayne@68 613 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
jpayne@68 614 long[] baseCounts=null;
jpayne@68 615 byte[] r16S=null;
jpayne@68 616 byte[] r18S=null;
jpayne@68 617 int r16SLen=0;
jpayne@68 618 int r18SLen=0;
jpayne@68 619 float probCorrect=-1;
jpayne@68 620 String name=null, name0=null, fname=ff.simpleName();
jpayne@68 621 LongList list=null;
jpayne@68 622 IntList countList=null;
jpayne@68 623 ArrayList<String> meta=null;
jpayne@68 624 long sum=0;
jpayne@68 625 byte[] lastHeader=null;
jpayne@68 626 ByteBuilder bb=new ByteBuilder(256);
jpayne@68 627
jpayne@68 628 for(start=0; start<limit;){
jpayne@68 629 // System.err.println("Processing line "+new String(line));
jpayne@68 630 if(buffer[start]=='#'){
jpayne@68 631 bb.clear();
jpayne@68 632 try {
jpayne@68 633 while(buffer[start]!='\n'){
jpayne@68 634 bb.append(buffer[start]);
jpayne@68 635 start++;
jpayne@68 636 if(start>=limit){start=0; limit=is.read(buffer);}
jpayne@68 637 }
jpayne@68 638 } catch (IOException e) {
jpayne@68 639 KillSwitch.exceptionKill(e);
jpayne@68 640 }
jpayne@68 641 start++;
jpayne@68 642
jpayne@68 643 if(r16SLen>0 || r18SLen>0){
jpayne@68 644 byte[] ssu=bb.toBytes(5, bb.length());
jpayne@68 645 if(bb.startsWith("#16S:") || bb.startsWith("#SSU:")){
jpayne@68 646 assert(r16SLen>0);
jpayne@68 647 assert(ssu.length==r16SLen) : r16SLen+", "+bb.length+"\n"+bb+"\n";
jpayne@68 648 r16S=ssu;
jpayne@68 649 r16SLen=0;
jpayne@68 650 }else if(bb.startsWith("#18S:")){
jpayne@68 651 assert(r18SLen>0);
jpayne@68 652 assert(ssu.length==r18SLen) : r18SLen+", "+bb.length+"\n"+bb+"\n";
jpayne@68 653 r18S=ssu;
jpayne@68 654 r18SLen=0;
jpayne@68 655 }else{
jpayne@68 656 assert(false) : bb;
jpayne@68 657 }
jpayne@68 658 }else{
jpayne@68 659
jpayne@68 660 // byte[] line=lastHeader=bb.toBytes();
jpayne@68 661 if(list!=null){
jpayne@68 662
jpayne@68 663 //This assertion fails sometimes for Silva per-sequence mode, but it's not important
jpayne@68 664 // assert(list.size==list.array.length) : list.size+", "+list.array.length+(lastHeader==null ? "" : ", "+new String(lastHeader));
jpayne@68 665
jpayne@68 666 if(NUC || unsorted){
jpayne@68 667 list.sort();
jpayne@68 668 list.shrinkToUnique();
jpayne@68 669 }else{
jpayne@68 670 list.shrink();
jpayne@68 671 }
jpayne@68 672 if(list.size()>0 || allowZeroSizeSketch){
jpayne@68 673 int[] keyCounts=countList==null ? null : countList.array;
jpayne@68 674 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
jpayne@68 675 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
jpayne@68 676 sketch.spid=spid;
jpayne@68 677 sketches.add(sketch);
jpayne@68 678 }
jpayne@68 679 // System.err.println("Made sketch "+sketch);
jpayne@68 680 }
jpayne@68 681 name=name0=null;
jpayne@68 682 fname=ff.simpleName();
jpayne@68 683 list=null;
jpayne@68 684 countList=null;
jpayne@68 685 meta=null;
jpayne@68 686 baseCounts=null;
jpayne@68 687 r16S=null;
jpayne@68 688 r18S=null;
jpayne@68 689 r16SLen=0;
jpayne@68 690 r18SLen=0;
jpayne@68 691 sum=0;
jpayne@68 692 taxID=-1;
jpayne@68 693 imgID=-1;
jpayne@68 694 genomeSizeBases=0;
jpayne@68 695 genomeSizeKmers=0;
jpayne@68 696 genomeSequences=0;
jpayne@68 697 probCorrect=-1;
jpayne@68 698 int k_sketch=defaultK;
jpayne@68 699 int k2_sketch=0;
jpayne@68 700 int hashVersion_sketch=1;
jpayne@68 701
jpayne@68 702 if(bb.length>1){
jpayne@68 703 String[] split=new String(bb.array, 1, bb.length-1).split("\t");
jpayne@68 704 for(String s : split){
jpayne@68 705 final int colon=s.indexOf(':');
jpayne@68 706 final String sub=s.substring(colon+1);
jpayne@68 707 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
jpayne@68 708 currentSketchSize=Integer.parseInt(sub);
jpayne@68 709 }else if(s.startsWith("CD:")){//Coding
jpayne@68 710 A48=HEX=NUC=delta=counts=unsorted=false;
jpayne@68 711
jpayne@68 712 for(int i=0; i<sub.length(); i++){
jpayne@68 713 char c=sub.charAt(i);
jpayne@68 714 if(c=='A'){A48=true;}
jpayne@68 715 else if(c=='H'){HEX=true;}
jpayne@68 716 else if(c=='R'){A48=HEX=false;}
jpayne@68 717 else if(c=='N'){NUC=true;}
jpayne@68 718 else if(c=='D'){delta=true;}
jpayne@68 719 else if(c=='C'){counts=true;}
jpayne@68 720 else if(c=='U'){unsorted=true;}
jpayne@68 721 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+bb;}
jpayne@68 722 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+bb;}
jpayne@68 723 else{assert(false) : "Unknown coding symbol: "+c+"\t"+bb;}
jpayne@68 724 }
jpayne@68 725 }else if(s.startsWith("K:")){//Kmer length
jpayne@68 726 if(sub.indexOf(',')>=0){
jpayne@68 727 String[] subsplit=sub.split(",");
jpayne@68 728 assert(subsplit.length==2) : "Bad header component "+s;
jpayne@68 729 int x=Integer.parseInt(subsplit[0]);
jpayne@68 730 int y=Integer.parseInt(subsplit[1]);
jpayne@68 731 k_sketch=Tools.max(x, y);
jpayne@68 732 k2_sketch=Tools.min(x, y);
jpayne@68 733 }else{
jpayne@68 734 k_sketch=Integer.parseInt(sub);
jpayne@68 735 k2_sketch=0;
jpayne@68 736 }
jpayne@68 737 }else if(s.startsWith("H:")){//Hash version
jpayne@68 738 hashVersion_sketch=Integer.parseInt(sub);
jpayne@68 739 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
jpayne@68 740 baseCounts=Parse.parseLongArray(sub);
jpayne@68 741 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
jpayne@68 742 genomeSizeBases=Long.parseLong(sub);
jpayne@68 743 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
jpayne@68 744 genomeSizeKmers=Long.parseLong(sub);
jpayne@68 745 }else if(s.startsWith("GQ:")){
jpayne@68 746 genomeSequences=Long.parseLong(sub);
jpayne@68 747 }else if(s.startsWith("GE:")){//Genome size estimate kmers
jpayne@68 748 //ignore
jpayne@68 749 }else if(s.startsWith("PC:")){//Probability of correctness
jpayne@68 750 probCorrect=Float.parseFloat(sub);
jpayne@68 751 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
jpayne@68 752 taxID=Integer.parseInt(sub);
jpayne@68 753 }else if(s.startsWith("IMG:")){
jpayne@68 754 imgID=Long.parseLong(sub);
jpayne@68 755 }else if(s.startsWith("SPID:")){
jpayne@68 756 spid=Integer.parseInt(sub);
jpayne@68 757 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
jpayne@68 758 name=sub;
jpayne@68 759 }else if(s.startsWith("FN:")){
jpayne@68 760 fname=sub;
jpayne@68 761 }else if(s.startsWith("NM0:")){
jpayne@68 762 name0=sub;
jpayne@68 763 }else if(s.startsWith("MT_")){
jpayne@68 764 if(meta==null){meta=new ArrayList<String>(1);}
jpayne@68 765 meta.add(s.substring(3));
jpayne@68 766 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
jpayne@68 767 r16SLen=Integer.parseInt(sub);
jpayne@68 768 }else if(s.startsWith("18S:")){
jpayne@68 769 r18SLen=Integer.parseInt(sub);
jpayne@68 770 }else{
jpayne@68 771 assert(false) : "Unsupported header tag "+s;
jpayne@68 772 }
jpayne@68 773 }
jpayne@68 774 }
jpayne@68 775
jpayne@68 776 if(KILL_OK){
jpayne@68 777 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+
jpayne@68 778 " differs from loaded kmer length "+k+"\n"+bb+"\nfile: "+ff.name());}
jpayne@68 779 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+
jpayne@68 780 " differs from loaded kmer length "+k+","+k2+"\n"+bb+"\nfile: "+ff.name());}
jpayne@68 781 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
jpayne@68 782 " differs from loaded hash version "+HASH_VERSION+".\n"
jpayne@68 783 + "You may need to download the latest version of BBTools.\nfile: "+ff.name());}
jpayne@68 784 }else{//Potential hang
jpayne@68 785 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+bb;
jpayne@68 786 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+bb;
jpayne@68 787 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
jpayne@68 788 " differs from loaded hash version "+HASH_VERSION+".\n"
jpayne@68 789 + "You may need to download the latest version of BBTools.\nfile: "+ff.name();
jpayne@68 790 }
jpayne@68 791 if(currentSketchSize>0 || allowZeroSizeSketch){
jpayne@68 792 list=new LongList(Tools.max(1, currentSketchSize));
jpayne@68 793 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));}
jpayne@68 794 }
jpayne@68 795 }
jpayne@68 796 }else{
jpayne@68 797 bb.clear();
jpayne@68 798 try {
jpayne@68 799 while(buffer[start]!='\n'){
jpayne@68 800 bb.append(buffer[start]);
jpayne@68 801 start++;
jpayne@68 802 if(start>=limit){start=0; limit=is.read(buffer);}
jpayne@68 803 }
jpayne@68 804 } catch (IOException e) {
jpayne@68 805 KillSwitch.exceptionKill(e);
jpayne@68 806 }
jpayne@68 807 start++;
jpayne@68 808
jpayne@68 809 long x=(counts ? Sketch.parseA48C(bb, countList) : A48 ? Sketch.parseA48(bb) :
jpayne@68 810 HEX ? Sketch.parseHex(bb) : NUC ? Sketch.parseNuc(bb) : Parse.parseLong(bb.array, 0, bb.length));
jpayne@68 811 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
jpayne@68 812 sum+=x;
jpayne@68 813 assert(x>=0 || NUC) : "x="+x+"\nline="+bb+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\n";
jpayne@68 814 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
jpayne@68 815 assert(list!=null) : bb;
jpayne@68 816 long key=(delta ? sum : x);
jpayne@68 817 if(key>=0){list.add(key);}
jpayne@68 818 }
jpayne@68 819
jpayne@68 820 if(start>=limit){
jpayne@68 821 start=0;
jpayne@68 822 try {
jpayne@68 823 limit=is.read(buffer);
jpayne@68 824 } catch (IOException e) {
jpayne@68 825 KillSwitch.exceptionKill(e);
jpayne@68 826 }
jpayne@68 827 }
jpayne@68 828 }
jpayne@68 829 if(list!=null && (list.size>0 || allowZeroSizeSketch)){
jpayne@68 830
jpayne@68 831 //This assertion fails sometimes for Silva per-sequence mode, but it's not important
jpayne@68 832 // assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) :
jpayne@68 833 // list.size+"!="+list.array.length+(lastHeader==null ? "" : "\n"+new String(lastHeader));
jpayne@68 834
jpayne@68 835 if(NUC || unsorted){
jpayne@68 836 list.sort();
jpayne@68 837 list.shrinkToUnique();
jpayne@68 838 }else{
jpayne@68 839 list.shrink();
jpayne@68 840 }
jpayne@68 841 int[] keyCounts=countList==null ? null : countList.array;
jpayne@68 842 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
jpayne@68 843 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
jpayne@68 844 sketch.spid=spid;
jpayne@68 845 sketches.add(sketch);
jpayne@68 846 }
jpayne@68 847 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");}
jpayne@68 848 try {
jpayne@68 849 is.close();
jpayne@68 850 } catch (IOException e) {
jpayne@68 851 e.printStackTrace();
jpayne@68 852 }
jpayne@68 853 return sketches;
jpayne@68 854 }
jpayne@68 855
jpayne@68 856 /*--------------------------------------------------------------*/
jpayne@68 857
jpayne@68 858 public ArrayList<Sketch> loadSketchesFromString(String sketchString){
jpayne@68 859 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
jpayne@68 860
jpayne@68 861 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
jpayne@68 862 int currentSketchSize=stTargetSketchSize;
jpayne@68 863 int taxID=-1;
jpayne@68 864 long spid=-1;
jpayne@68 865 long imgID=-1;
jpayne@68 866 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
jpayne@68 867 long[] baseCounts=null;
jpayne@68 868 byte[] r16S=null;
jpayne@68 869 byte[] r18S=null;
jpayne@68 870 int r16SLen=0;
jpayne@68 871 int r18SLen=0;
jpayne@68 872 float probCorrect=-1;
jpayne@68 873 String name=null, name0=null, fname=null;
jpayne@68 874 LongList list=null;
jpayne@68 875 IntList countList=null;
jpayne@68 876 ArrayList<String> meta=null;
jpayne@68 877 long sum=0;
jpayne@68 878
jpayne@68 879 String[] split0=sketchString.split("\n");
jpayne@68 880 for(String line : split0){
jpayne@68 881 if(line.length()>0){
jpayne@68 882 // System.err.println("Processing line "+new String(line));
jpayne@68 883 if(line.charAt(0)=='#'){
jpayne@68 884 if(line.length()>1 && line.charAt(1)=='#'){
jpayne@68 885 //ignore
jpayne@68 886 }else if(r16SLen>0 || r18SLen>0){
jpayne@68 887 byte[] ssu=KillSwitch.copyOfRange(line.getBytes(), 5, line.length());
jpayne@68 888 if(line.startsWith("#16S:") || line.startsWith("#SSU:")){
jpayne@68 889 assert(r16SLen>0);
jpayne@68 890 assert(ssu.length==r16SLen) : r16SLen+", "+line.length()+"\n"+line+"\n";
jpayne@68 891 r16S=ssu;
jpayne@68 892 r16SLen=0;
jpayne@68 893 }else if(line.startsWith("#18S:")){
jpayne@68 894 assert(r18SLen>0);
jpayne@68 895 assert(ssu.length==r18SLen) : r18SLen+", "+line.length()+"\n"+line+"\n";
jpayne@68 896 r18S=ssu;
jpayne@68 897 r18SLen=0;
jpayne@68 898 }else{
jpayne@68 899 assert(false) : line;
jpayne@68 900 }
jpayne@68 901 }else{
jpayne@68 902 if(list!=null){
jpayne@68 903 assert(list.size==list.array.length);
jpayne@68 904 if(NUC || unsorted){
jpayne@68 905 list.sort();
jpayne@68 906 list.shrinkToUnique();
jpayne@68 907 }else{
jpayne@68 908 list.shrink();
jpayne@68 909 }
jpayne@68 910 if(list.size()>0){
jpayne@68 911 int[] keyCounts=countList==null ? null : countList.array;
jpayne@68 912 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
jpayne@68 913 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
jpayne@68 914 sketch.spid=spid;
jpayne@68 915 sketches.add(sketch);
jpayne@68 916 }
jpayne@68 917 // System.err.println("Made sketch "+sketch);
jpayne@68 918 }
jpayne@68 919 name=name0=null;
jpayne@68 920 fname=null;
jpayne@68 921 list=null;
jpayne@68 922 countList=null;
jpayne@68 923 meta=null;
jpayne@68 924 baseCounts=null;
jpayne@68 925 r16S=null;
jpayne@68 926 r18S=null;
jpayne@68 927 r16SLen=0;
jpayne@68 928 r18SLen=0;
jpayne@68 929 sum=0;
jpayne@68 930 taxID=-1;
jpayne@68 931 imgID=-1;
jpayne@68 932 genomeSizeBases=0;
jpayne@68 933 genomeSizeKmers=0;
jpayne@68 934 genomeSequences=0;
jpayne@68 935 probCorrect=-1;
jpayne@68 936 int k_sketch=defaultK;
jpayne@68 937 int k2_sketch=0;
jpayne@68 938 int hashVersion_sketch=1;
jpayne@68 939
jpayne@68 940 if(line.length()>1){
jpayne@68 941 String[] split=line.substring(1).split("\t");
jpayne@68 942 for(String s : split){
jpayne@68 943 final int colon=s.indexOf(':');
jpayne@68 944 final String sub=s.substring(colon+1);
jpayne@68 945 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
jpayne@68 946 currentSketchSize=Integer.parseInt(sub);
jpayne@68 947 }else if(s.startsWith("CD:")){//Coding
jpayne@68 948 A48=HEX=NUC=delta=counts=false;
jpayne@68 949
jpayne@68 950 for(int i=0; i<sub.length(); i++){
jpayne@68 951 char c=sub.charAt(i);
jpayne@68 952 if(c=='A'){A48=true;}
jpayne@68 953 else if(c=='H'){HEX=true;}
jpayne@68 954 else if(c=='R'){A48=HEX=false;}
jpayne@68 955 else if(c=='N'){NUC=true;}
jpayne@68 956 else if(c=='D'){delta=true;}
jpayne@68 957 else if(c=='C'){counts=true;}
jpayne@68 958 else if(c=='U'){unsorted=true;}
jpayne@68 959 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);}
jpayne@68 960 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);}
jpayne@68 961 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);}
jpayne@68 962 }
jpayne@68 963
jpayne@68 964 }else if(s.startsWith("K:")){//Kmer length
jpayne@68 965 if(sub.indexOf(',')>=0){
jpayne@68 966 String[] subsplit=sub.split(",");
jpayne@68 967 assert(subsplit.length==2) : "Bad header component "+s;
jpayne@68 968 int x=Integer.parseInt(subsplit[0]);
jpayne@68 969 int y=Integer.parseInt(subsplit[1]);
jpayne@68 970 k_sketch=Tools.max(x, y);
jpayne@68 971 k2_sketch=Tools.min(x, y);
jpayne@68 972 }else{
jpayne@68 973 k_sketch=Integer.parseInt(s);
jpayne@68 974 k2_sketch=0;
jpayne@68 975 }
jpayne@68 976 }else if(s.startsWith("H:")){//Hash version
jpayne@68 977 hashVersion_sketch=Integer.parseInt(sub);
jpayne@68 978 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
jpayne@68 979 baseCounts=Parse.parseLongArray(sub);
jpayne@68 980 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
jpayne@68 981 genomeSizeBases=Long.parseLong(sub);
jpayne@68 982 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
jpayne@68 983 genomeSizeKmers=Long.parseLong(sub);
jpayne@68 984 }else if(s.startsWith("GQ:")){
jpayne@68 985 genomeSequences=Long.parseLong(sub);
jpayne@68 986 }else if(s.startsWith("GE:")){//Genome size estimate kmers
jpayne@68 987 //ignore
jpayne@68 988 }else if(s.startsWith("PC:")){//Probability of correctness
jpayne@68 989 probCorrect=Float.parseFloat(sub);
jpayne@68 990 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
jpayne@68 991 taxID=Integer.parseInt(sub);
jpayne@68 992 }else if(s.startsWith("IMG:")){
jpayne@68 993 imgID=Long.parseLong(sub);
jpayne@68 994 }else if(s.startsWith("SPID:")){
jpayne@68 995 spid=Integer.parseInt(sub);
jpayne@68 996 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
jpayne@68 997 name=sub;
jpayne@68 998 }else if(s.startsWith("FN:")){
jpayne@68 999 fname=sub;
jpayne@68 1000 }else if(s.startsWith("NM0:")){
jpayne@68 1001 name0=sub;
jpayne@68 1002 }else if(s.startsWith("MT_")){
jpayne@68 1003 if(meta==null){meta=new ArrayList<String>(1);}
jpayne@68 1004 meta.add(s.substring(3));
jpayne@68 1005 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
jpayne@68 1006 r16SLen=Integer.parseInt(sub);
jpayne@68 1007 }else if(s.startsWith("18S:")){
jpayne@68 1008 r18SLen=Integer.parseInt(sub);
jpayne@68 1009 }else{
jpayne@68 1010 assert(false) : "Unsupported header tag "+s;
jpayne@68 1011 }
jpayne@68 1012 }
jpayne@68 1013 }
jpayne@68 1014 if(KILL_OK){
jpayne@68 1015 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line));}
jpayne@68 1016 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line));}
jpayne@68 1017 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
jpayne@68 1018 " differs from loaded hash version "+HASH_VERSION+".\n"
jpayne@68 1019 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n");}
jpayne@68 1020 }else{//Potential hang
jpayne@68 1021 assert(k_sketch==k && !NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line);
jpayne@68 1022 assert(k2_sketch==k2 && !NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line);
jpayne@68 1023 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
jpayne@68 1024 " differs from loaded hash version "+HASH_VERSION+".\n"
jpayne@68 1025 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n";
jpayne@68 1026 }
jpayne@68 1027
jpayne@68 1028
jpayne@68 1029 if(currentSketchSize>0){
jpayne@68 1030 list=new LongList(currentSketchSize);
jpayne@68 1031 if(counts){countList=new IntList(currentSketchSize);}
jpayne@68 1032 }
jpayne@68 1033 }
jpayne@68 1034 }else{
jpayne@68 1035 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) :
jpayne@68 1036 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Long.parseLong(line));
jpayne@68 1037 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
jpayne@68 1038 sum+=x;
jpayne@68 1039 assert(x>=0 || NUC) : x+"\n"+new String(line);
jpayne@68 1040 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
jpayne@68 1041 long key=(delta ? sum : x);
jpayne@68 1042 if(key>=0){list.add(key);}
jpayne@68 1043 }
jpayne@68 1044 }
jpayne@68 1045 }
jpayne@68 1046
jpayne@68 1047 if(list!=null){
jpayne@68 1048 assert(list.size==list.array.length || list.size()==0 || NUC || unsorted);
jpayne@68 1049 if(NUC || unsorted){
jpayne@68 1050 list.sort();
jpayne@68 1051 list.shrinkToUnique();
jpayne@68 1052 }else{
jpayne@68 1053 list.shrink();
jpayne@68 1054 }
jpayne@68 1055 int[] keyCounts=countList==null ? null : countList.array;
jpayne@68 1056 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
jpayne@68 1057 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
jpayne@68 1058 sketch.spid=spid;
jpayne@68 1059 sketches.add(sketch);
jpayne@68 1060 }
jpayne@68 1061 return sketches;
jpayne@68 1062 }
jpayne@68 1063
jpayne@68 1064 /*--------------------------------------------------------------*/
jpayne@68 1065 /*---------------- Threads ----------------*/
jpayne@68 1066 /*--------------------------------------------------------------*/
jpayne@68 1067
jpayne@68 1068
jpayne@68 1069
jpayne@68 1070 private class LoadThread extends Thread{
jpayne@68 1071
jpayne@68 1072 public LoadThread(ConcurrentLinkedQueue<StringNum> queue_, int mode_, float samplerate_, long reads_, float minEntropy, float minProb, byte minQual) {
jpayne@68 1073 queue=queue_;
jpayne@68 1074 list=new ArrayList<Sketch>();
jpayne@68 1075 smm=new SketchMakerMini(SketchTool.this, mode_, minEntropy, minProb, minQual);
jpayne@68 1076 samplerate=samplerate_;
jpayne@68 1077 reads=reads_;
jpayne@68 1078 }
jpayne@68 1079
jpayne@68 1080 @Override
jpayne@68 1081 public void run(){
jpayne@68 1082 success=false;
jpayne@68 1083 for(StringNum sn=queue.poll(); sn!=null; sn=queue.poll()){
jpayne@68 1084 ArrayList<Sketch> temp=null;
jpayne@68 1085 try {
jpayne@68 1086 temp=loadSketchesFromFile(sn.s, smm, 1, reads, smm.mode, samplerate, smm.minEntropy(), smm.minProb(), smm.minQual(), false);
jpayne@68 1087 } catch (Throwable e) {
jpayne@68 1088 System.err.println("Failure loading "+sn+":\n"+e);
jpayne@68 1089 e.printStackTrace();
jpayne@68 1090 success=false;
jpayne@68 1091 }
jpayne@68 1092 if(temp!=null && temp.size()>0){
jpayne@68 1093 if(smm.mode==PER_FILE){
jpayne@68 1094 // assert(temp.size()==1) : temp.size();
jpayne@68 1095 temp.get(0).sketchID=(int)sn.n;
jpayne@68 1096 }
jpayne@68 1097 for(Sketch s : temp){add(s);}
jpayne@68 1098 }
jpayne@68 1099 }
jpayne@68 1100 success=true;
jpayne@68 1101 }
jpayne@68 1102
jpayne@68 1103 private void add(Sketch s){
jpayne@68 1104 if(list!=null){
jpayne@68 1105 list.add(s);
jpayne@68 1106 return;
jpayne@68 1107 }
jpayne@68 1108 assert(false) : "Unsupported."; //The map logic is broken; needs to be synchronized.
jpayne@68 1109 // if(s.taxID<0){return;}
jpayne@68 1110 //// assert(s.taxID>-1) : s.toHeader();
jpayne@68 1111 // TaxNode tn=tree.getNode(s.taxID);
jpayne@68 1112 // while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){
jpayne@68 1113 // TaxNode temp=tree.getNode(tn.pid);
jpayne@68 1114 // if(temp==null){break;}
jpayne@68 1115 // tn=temp;
jpayne@68 1116 // }
jpayne@68 1117 // if(tn==null){return;}
jpayne@68 1118 // Integer key=tn.id;
jpayne@68 1119 // Sketch old=map.get(key);
jpayne@68 1120 // if(old==null){
jpayne@68 1121 // s.taxID=key;
jpayne@68 1122 // map.put(key, s);
jpayne@68 1123 // }else{
jpayne@68 1124 // synchronized(old){
jpayne@68 1125 // old.add(s, maxLen);
jpayne@68 1126 // }
jpayne@68 1127 // }
jpayne@68 1128 }
jpayne@68 1129
jpayne@68 1130 final ConcurrentLinkedQueue<StringNum> queue;
jpayne@68 1131 ArrayList<Sketch> list;
jpayne@68 1132 boolean success=false;
jpayne@68 1133 final SketchMakerMini smm;
jpayne@68 1134 final float samplerate;
jpayne@68 1135 final long reads;
jpayne@68 1136
jpayne@68 1137 // ConcurrentHashMap<Integer, Sketch> map;
jpayne@68 1138
jpayne@68 1139 }
jpayne@68 1140
jpayne@68 1141 private class LoadThread2 extends Thread{
jpayne@68 1142
jpayne@68 1143 LoadThread2(ConcurrentReadInputStream cris_, float minEntropy, float minProb, byte minQual){
jpayne@68 1144 cris=cris_;
jpayne@68 1145 smm=new SketchMakerMini(SketchTool.this, ONE_SKETCH, minEntropy, minProb, minQual);
jpayne@68 1146 }
jpayne@68 1147
jpayne@68 1148 @Override
jpayne@68 1149 public void run(){
jpayne@68 1150
jpayne@68 1151 //Grab the first ListNum of reads
jpayne@68 1152 ListNum<Read> ln=cris.nextList();
jpayne@68 1153 //Grab the actual read list from the ListNum
jpayne@68 1154 ArrayList<Read> reads=(ln!=null ? ln.list : null);
jpayne@68 1155
jpayne@68 1156 //As long as there is a nonempty read list...
jpayne@68 1157 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 1158
jpayne@68 1159 //Loop through each read in the list
jpayne@68 1160 for(int idx=0; idx<reads.size(); idx++){
jpayne@68 1161 final Read r1=reads.get(idx);
jpayne@68 1162 final Read r2=r1.mate;
jpayne@68 1163
jpayne@68 1164 if(validate){
jpayne@68 1165 if(r1!=null){r1.validate(true);}
jpayne@68 1166 if(r2!=null){r2.validate(true);}
jpayne@68 1167 }
jpayne@68 1168
jpayne@68 1169 smm.processReadPair(r1, r2);
jpayne@68 1170 }
jpayne@68 1171
jpayne@68 1172 //Notify the input stream that the list was used
jpayne@68 1173 cris.returnList(ln);
jpayne@68 1174
jpayne@68 1175 //Fetch a new list
jpayne@68 1176 ln=cris.nextList();
jpayne@68 1177 reads=(ln!=null ? ln.list : null);
jpayne@68 1178 }
jpayne@68 1179
jpayne@68 1180 //Notify the input stream that the final list was used
jpayne@68 1181 if(ln!=null){
jpayne@68 1182 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
jpayne@68 1183 }
jpayne@68 1184 }
jpayne@68 1185
jpayne@68 1186 private final boolean validate=!Read.VALIDATE_IN_CONSTRUCTOR;
jpayne@68 1187 ConcurrentReadInputStream cris;
jpayne@68 1188 SketchMakerMini smm;
jpayne@68 1189
jpayne@68 1190 }
jpayne@68 1191
jpayne@68 1192 /** Converts KmerTableSets to Heaps */
jpayne@68 1193 private class SketchThread extends Thread {
jpayne@68 1194
jpayne@68 1195 SketchThread(AtomicInteger next_, KmerTableSet kts_){
jpayne@68 1196 next=next_;
jpayne@68 1197 kts=kts_;
jpayne@68 1198 }
jpayne@68 1199
jpayne@68 1200 @Override
jpayne@68 1201 public void run(){
jpayne@68 1202 final int ways=kts.ways();
jpayne@68 1203 int tnum=next.getAndIncrement();
jpayne@68 1204 while(tnum<ways){
jpayne@68 1205 HashArray1D table=kts.getTable(tnum);
jpayne@68 1206 if(stTargetSketchSize>0){
jpayne@68 1207 if(heap==null){heap=new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts);}
jpayne@68 1208 toHeap(table, heap);
jpayne@68 1209 }else{
jpayne@68 1210 if(list==null){list=new LongList();}
jpayne@68 1211 toList(table, list);
jpayne@68 1212 }
jpayne@68 1213 tnum=next.getAndIncrement();
jpayne@68 1214 }
jpayne@68 1215 }
jpayne@68 1216
jpayne@68 1217 final AtomicInteger next;
jpayne@68 1218 final KmerTableSet kts;
jpayne@68 1219 SketchHeap heap;
jpayne@68 1220 LongList list;
jpayne@68 1221 }
jpayne@68 1222
jpayne@68 1223 /*--------------------------------------------------------------*/
jpayne@68 1224 /*---------------- Read Loading ----------------*/
jpayne@68 1225 /*--------------------------------------------------------------*/
jpayne@68 1226
jpayne@68 1227 public Sketch processReadsMT(String fname, int maxThreads, long reads, int mode, float samplerate,
jpayne@68 1228 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
jpayne@68 1229 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
jpayne@68 1230 return processReadsMT(fname.replaceFirst("#", "1"), fname.replaceFirst("#", "2"), maxThreads, reads, mode, samplerate,
jpayne@68 1231 minEntropy, minProb, minQual, allowZeroSizeSketch);
jpayne@68 1232 }else{
jpayne@68 1233 return processReadsMT(fname, null, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
jpayne@68 1234 }
jpayne@68 1235 }
jpayne@68 1236
jpayne@68 1237 public Sketch processReadsMT(String fname1, String fname2, int maxThreads, long reads, int mode, float samplerate,
jpayne@68 1238 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
jpayne@68 1239 final FileFormat ffin1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
jpayne@68 1240 final FileFormat ffin2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
jpayne@68 1241 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
jpayne@68 1242 }
jpayne@68 1243
jpayne@68 1244 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){
jpayne@68 1245 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch);
jpayne@68 1246 }
jpayne@68 1247
jpayne@68 1248 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, float samplerate,
jpayne@68 1249 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
jpayne@68 1250 assert(mode==ONE_SKETCH || mode==PER_FILE);
jpayne@68 1251 final boolean compressed=ffin1.compressed();
jpayne@68 1252
jpayne@68 1253 maxThreads=Tools.mid(1, maxThreads, Shared.threads());
jpayne@68 1254
jpayne@68 1255 //Create a read input stream
jpayne@68 1256 final ConcurrentReadInputStream cris;
jpayne@68 1257 String simpleName;
jpayne@68 1258 {
jpayne@68 1259 simpleName=ffin1.simpleName();
jpayne@68 1260 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
jpayne@68 1261 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
jpayne@68 1262 cris.start(); //Start the stream
jpayne@68 1263 // if(verbose){outstream.println("Started cris");}
jpayne@68 1264 }
jpayne@68 1265
jpayne@68 1266 //TODO: bgzip actually decompresses fast.
jpayne@68 1267 final int threads=(int)Tools.min(maxThreads,
jpayne@68 1268 1.75f*(compressed ? 1 : 2)*(ffin2==null ? 4 : 8)*(mergePairs ? 3 : minEntropy>0 ? 2 : 1));
jpayne@68 1269
jpayne@68 1270 if(verbose2){System.err.println("Starting "+threads+" load threads.");}
jpayne@68 1271 ArrayList<LoadThread2> list=new ArrayList<LoadThread2>(threads);
jpayne@68 1272 for(int i=0; i<threads; i++){
jpayne@68 1273 list.add(new LoadThread2(cris, minEntropy, minProb, minQual));
jpayne@68 1274 list.get(i).start();
jpayne@68 1275 }
jpayne@68 1276
jpayne@68 1277 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads);
jpayne@68 1278
jpayne@68 1279 for(LoadThread2 pt : list){
jpayne@68 1280
jpayne@68 1281 //Wait until this thread has terminated
jpayne@68 1282 while(pt.getState()!=Thread.State.TERMINATED){
jpayne@68 1283 try {
jpayne@68 1284 //Attempt a join operation
jpayne@68 1285 pt.join();
jpayne@68 1286 } catch (InterruptedException e) {
jpayne@68 1287 //Potentially handle this, if it is expected to occur
jpayne@68 1288 e.printStackTrace();
jpayne@68 1289 }
jpayne@68 1290 }
jpayne@68 1291
jpayne@68 1292 if(pt.smm.heap!=null && pt.smm.heap.size()>0){
jpayne@68 1293 heaps.add(pt.smm.heap);
jpayne@68 1294 }
jpayne@68 1295 }
jpayne@68 1296 list.clear();
jpayne@68 1297 ReadWrite.closeStream(cris);
jpayne@68 1298
jpayne@68 1299 if(verbose2){System.err.println("Generating a sketch by combining thread output.");}
jpayne@68 1300 Sketch sketch=toSketch(heaps, allowZeroSizeSketch);
jpayne@68 1301 if(verbose2){System.err.println("Resulting sketch: "+((sketch==null) ? "null" : "length="+sketch.length()));}
jpayne@68 1302 if(sketch!=null){sketch.setFname(simpleName);}
jpayne@68 1303 return sketch;
jpayne@68 1304 }
jpayne@68 1305
jpayne@68 1306 /*--------------------------------------------------------------*/
jpayne@68 1307 /*---------------- Writing ----------------*/
jpayne@68 1308 /*--------------------------------------------------------------*/
jpayne@68 1309
jpayne@68 1310 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff[]){
jpayne@68 1311 final int len=ff.length;
jpayne@68 1312 ByteStreamWriter tsw[]=new ByteStreamWriter[len];
jpayne@68 1313 for(int i=0; i<len; i++){
jpayne@68 1314 tsw[i]=new ByteStreamWriter(ff[i]);
jpayne@68 1315 tsw[i].start();
jpayne@68 1316 }
jpayne@68 1317 boolean error=false;
jpayne@68 1318 for(int i=0; i<sketches.size(); i++){
jpayne@68 1319 write(sketches.get(i), tsw[i%len], new ByteBuilder());
jpayne@68 1320 }
jpayne@68 1321 for(int i=0; i<len; i++){
jpayne@68 1322 error|=tsw[i].poisonAndWait();
jpayne@68 1323 }
jpayne@68 1324 return error;
jpayne@68 1325 }
jpayne@68 1326
jpayne@68 1327 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff){
jpayne@68 1328 final ByteStreamWriter tsw=new ByteStreamWriter(ff);
jpayne@68 1329 final ByteBuilder bb=new ByteBuilder();
jpayne@68 1330 tsw.start();
jpayne@68 1331 for(Sketch sketch : sketches){
jpayne@68 1332 write(sketch, tsw, bb);
jpayne@68 1333 }
jpayne@68 1334 return tsw.poisonAndWait();
jpayne@68 1335 }
jpayne@68 1336
jpayne@68 1337 public static boolean write(Sketch sketch, FileFormat ff){
jpayne@68 1338 // System.err.println(ff.name()+", "+new File(ff.name()).exists());
jpayne@68 1339 ByteStreamWriter tsw=new ByteStreamWriter(ff);
jpayne@68 1340 // assert(false) : new File(ff.name()).exists();
jpayne@68 1341 tsw.start();
jpayne@68 1342 write(sketch, tsw, null);
jpayne@68 1343 return tsw.poisonAndWait();
jpayne@68 1344 }
jpayne@68 1345
jpayne@68 1346 public static void write(Sketch sketch, ByteStreamWriter tsw, ByteBuilder bb){
jpayne@68 1347 if(bb==null){bb=new ByteBuilder();}
jpayne@68 1348 else{bb.clear();}
jpayne@68 1349 sketch.toBytes(bb);
jpayne@68 1350 tsw.print(bb);
jpayne@68 1351 }
jpayne@68 1352
jpayne@68 1353 /*--------------------------------------------------------------*/
jpayne@68 1354 /*---------------- Fields ----------------*/
jpayne@68 1355 /*--------------------------------------------------------------*/
jpayne@68 1356
jpayne@68 1357 // final EntropyTracker eTracker;
jpayne@68 1358 final int stTargetSketchSize;
jpayne@68 1359 public final int minKeyOccuranceCount;
jpayne@68 1360 /** Force kmer counts to be tracked. */
jpayne@68 1361 public final boolean trackCounts;
jpayne@68 1362 /** Merge reads before processing kmers. */
jpayne@68 1363 public final boolean mergePairs;
jpayne@68 1364
jpayne@68 1365 public static int BUFLEN=16384;
jpayne@68 1366 public static boolean BUFFERED_READER=false;
jpayne@68 1367
jpayne@68 1368 /*--------------------------------------------------------------*/
jpayne@68 1369 /*---------------- Static Fields ----------------*/
jpayne@68 1370 /*--------------------------------------------------------------*/
jpayne@68 1371
jpayne@68 1372 // public static boolean verbose=false;
jpayne@68 1373
jpayne@68 1374 }