comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchTool.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package sketch;
2
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.concurrent.ConcurrentLinkedQueue;
9 import java.util.concurrent.atomic.AtomicInteger;
10
11 import dna.AminoAcid;
12 import fileIO.ByteFile;
13 import fileIO.ByteStreamWriter;
14 import fileIO.FileFormat;
15 import fileIO.ReadWrite;
16 import kmer.HashArray1D;
17 import kmer.HashForest;
18 import kmer.KmerNode;
19 import kmer.KmerTableSet;
20 import shared.KillSwitch;
21 import shared.Parse;
22 import shared.Shared;
23 import shared.Tools;
24 import stream.ConcurrentReadInputStream;
25 import stream.Read;
26 import structures.ByteBuilder;
27 import structures.IntList;
28 import structures.ListNum;
29 import structures.LongList;
30 import structures.StringNum;
31
32 /**
33 * @author Brian Bushnell
34 * @date June 28, 2016
35 *
36 */
37 public final class SketchTool extends SketchObject {
38
39 /*--------------------------------------------------------------*/
40 /*---------------- Constructor ----------------*/
41 /*--------------------------------------------------------------*/
42
43 public SketchTool(int size_, DisplayParams params){
44 this(size_, params.minKeyOccuranceCount, params.trackCounts(), params.mergePairs);
45 }
46
47 public SketchTool(int size_, int minKeyOccuranceCount_, boolean trackCounts_, boolean mergePairs_){
48 stTargetSketchSize=size_;
49 minKeyOccuranceCount=minKeyOccuranceCount_;
50 trackCounts=trackCounts_;
51 mergePairs=mergePairs_;
52
53 assert(!aminoOrTranslate() || !rcomp) : "rcomp should be false in amino mode.";
54 assert(!aminoOrTranslate() || (k*AminoAcid.AMINO_SHIFT<64)) : "Protein sketches require 1 <= K <= "+(63/AminoAcid.AMINO_SHIFT)+".";
55 assert(k>0 && k<=32) : "Sketches require 1 <= K <= 32."; //123
56 }
57
58 /*--------------------------------------------------------------*/
59 /*---------------- Methods ----------------*/
60 /*--------------------------------------------------------------*/
61
62 public Sketch toSketch(KmerTableSet tables, boolean multithreaded){
63 final int threads=(multithreaded ? Tools.mid(1, Shared.threads(), tables.ways()) : 1);
64 return (threads<2 ? toSketch_ST(tables) : toSketch_MT(tables, threads));
65 }
66
67 private Sketch toSketch_ST(KmerTableSet tables){
68 SketchHeap heap=(stTargetSketchSize>0 ? new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts) : null);
69 LongList list=new LongList();
70
71 KmerTableSet kts=(KmerTableSet)tables;
72 for(int tnum=0; tnum<kts.ways; tnum++){
73 HashArray1D table=kts.getTable(tnum);
74 if(stTargetSketchSize>0){
75 toHeap(table, heap);
76 }else{
77 toList(table, list);
78 }
79 }
80 return stTargetSketchSize>0 ? new Sketch(heap, false, trackCounts, null) : toSketch(list);//TODO: Could add counts here
81 }
82
83 private Sketch toSketch_MT(KmerTableSet tables, final int threads){
84 ArrayList<SketchThread> alst=new ArrayList<SketchThread>(threads);
85 AtomicInteger ai=new AtomicInteger(0);
86 for(int i=0; i<threads; i++){
87 alst.add(new SketchThread(ai, tables));
88 }
89
90 //Start the threads
91 for(SketchThread pt : alst){
92 pt.start();
93 }
94
95 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads);
96 LongList list=new LongList();
97
98 for(SketchThread pt : alst){
99
100 //Wait until this thread has terminated
101 while(pt.getState()!=Thread.State.TERMINATED){
102 try {
103 //Attempt a join operation
104 pt.join();
105 } catch (InterruptedException e) {
106 //Potentially handle this, if it is expected to occur
107 e.printStackTrace();
108 }
109 }
110
111 if(stTargetSketchSize>=0){
112 if(pt.heap!=null && pt.heap.size()>0){
113 heaps.add(pt.heap);
114 }
115 }else{
116 if(pt.list!=null){list.append(pt.list);}
117 pt.list=null;
118 }
119 }
120 alst.clear();
121
122 return stTargetSketchSize>=0 ? toSketch(heaps, true) : toSketch(list);//TODO: Could add counts here
123 }
124
125 public SketchHeap toHeap(HashArray1D table, SketchHeap heap){
126 // if(heap==null){heap=new LongHeap(size, true);}
127 long[] kmers=table.array();
128 int[] counts=table.values();
129 for(int i=0; i<table.arrayLength(); i++){
130 int count=counts[i];
131 if(count>=minKeyOccuranceCount){
132 heap.genomeSizeKmers++;
133 long kmer=kmers[i];
134 long hashcode=hash(kmer);
135 if(hashcode>=minHashValue){
136 heap.add(hashcode);
137 }
138 }
139 }
140 HashForest forest=table.victims();
141 if(forest!=null){
142 for(KmerNode kn : forest.array()){
143 if(kn!=null){addRecursive(heap, kn);}
144 }
145 }
146 return heap;
147 }
148
149 public LongList toList(HashArray1D table, LongList list){
150 // if(heap==null){heap=new LongHeap(size, true);}
151 long[] kmers=table.array();
152 int[] counts=table.values();
153 for(int i=0; i<table.arrayLength(); i++){
154 int count=counts[i];
155 if(count>=minKeyOccuranceCount){
156 long kmer=kmers[i];
157 long hashcode=hash(kmer);
158 if(hashcode>=minHashValue){
159 list.add(hashcode);
160 }
161 }
162 }
163 HashForest forest=table.victims();
164 if(forest!=null){
165 for(KmerNode kn : forest.array()){
166 if(kn!=null){addRecursive(list, kn);}
167 }
168 }
169 return list;
170 }
171
172 // public long[] toSketchArray(ArrayList<LongHeap> heaps){
173 // if(heaps.size()==1){return toSketchArray(heaps.get(0));}
174 // LongList list=new LongList(size);
175 // for(LongHeap heap : heaps){
176 // while(heap.size()>0){list.add(Long.MAX_VALUE-heap.poll());}
177 // }
178 // list.sort();
179 // list.shrinkToUnique();
180 // list.size=Tools.min(size, list.size);
181 // return list.toArray();
182 // }
183
184 public Sketch toSketch(ArrayList<SketchHeap> heaps, boolean allowZeroSizeSketch){
185 if(heaps==null || heaps.isEmpty()){
186 if(allowZeroSizeSketch){
187 return new Sketch(new long[0], null, null, null, null, null);
188 }else{
189 return null;
190 }
191 }
192 SketchHeap a=heaps.get(0);
193 for(int i=1; i<heaps.size(); i++){
194 SketchHeap b=heaps.get(i);
195 a.add(b);
196 }
197 if(verbose2){System.err.println("Creating a sketch of size "+a.size()+".");}
198 return new Sketch(a, false, trackCounts, null);
199 }
200
201 Sketch toSketch(LongList list){
202 list.sort();
203 assert(list.size==0 || list.get(list.size()-1)>=minHashValue) : list.size+", "+list.get(list.size()-1)+", "+minHashValue;
204 list.shrinkToUnique();
205 list.reverse();
206 for(int i=0; i<list.size; i++){list.array[i]=Long.MAX_VALUE-list.array[i];}
207 return new Sketch(list.toArray(), null, null, null, null, null);
208 }
209
210 /*--------------------------------------------------------------*/
211 /*---------------- Helpers ----------------*/
212 /*--------------------------------------------------------------*/
213
214 private void addRecursive(SketchHeap heap, KmerNode kn){
215 if(kn==null){return;}
216 if(kn.count()>=minKeyOccuranceCount){
217 heap.genomeSizeKmers++;
218 long kmer=kn.pivot();
219 long hashcode=hash(kmer);
220 if(hashcode>=minHashValue){heap.add(hashcode);}
221 }
222 if(kn.left()!=null){addRecursive(heap, kn.left());}
223 if(kn.right()!=null){addRecursive(heap, kn.right());}
224 }
225
226 private void addRecursive(LongList list, KmerNode kn){
227 if(kn==null){return;}
228 if(kn.count()>=minKeyOccuranceCount){
229 long kmer=kn.pivot();
230 long hashcode=hash(kmer);
231 if(hashcode>=minHashValue){list.add(hashcode);}
232 }
233 if(kn.left()!=null){addRecursive(list, kn.left());}
234 if(kn.right()!=null){addRecursive(list, kn.right());}
235 }
236
237 /*--------------------------------------------------------------*/
238 /*---------------- I/O ----------------*/
239 /*--------------------------------------------------------------*/
240
241 // public static ArrayList<Sketch> loadSketches_ST(String...fnames){
242 // ArrayList<Sketch> sketches=null;
243 // for(String s : fnames){
244 // ArrayList<Sketch> temp;
245 // if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){
246 // temp=loadSketches(s);
247 // }else{
248 // temp=loadSketches_ST(s.split(","));
249 // }
250 // if(sketches==null){sketches=temp;}
251 // else{sketches.addAll(temp);}
252 // }
253 // return sketches;
254 // }
255
256 // public static ArrayList<Sketch> loadSketches_MT(ArrayList<String> fnames){
257 // return loadSketches_MT(0, null, fnames.toArray(new String[0]));
258 // }
259
260 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, Collection<String> fnames){
261 return loadSketches_MT(params.mode, params.samplerate, params.maxReads,
262 params.minEntropy, params.minProb, params.minQual, fnames);
263 }
264
265 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, String...fnames){
266 return loadSketches_MT(params.mode, params.samplerate, params.maxReads,
267 params.minEntropy, params.minProb, params.minQual, fnames);
268 }
269
270 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, Collection<String> fnames){
271 return loadSketches_MT(mode, samplerate, reads, minEntropy, minProb, minQual, fnames.toArray(new String[0]));
272 }
273
274 //TODO: This is only multithreaded per file in persequence mode.
275 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, String...fnames){
276
277 ConcurrentLinkedQueue<StringNum> decomposedFnames=new ConcurrentLinkedQueue<StringNum>();
278 int num=0;
279 for(String s : fnames){
280 if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){
281 num++;
282 decomposedFnames.add(new StringNum(s, num));
283 }else{
284 for(String s2 : s.split(",")){
285 num++;
286 decomposedFnames.add(new StringNum(s2, num));
287 }
288 }
289 }
290
291 if(decomposedFnames.size()==0){return null;}
292 if(decomposedFnames.size()==1){return loadSketchesFromFile(decomposedFnames.poll().s, null, 0, reads, mode, samplerate, minEntropy, minProb, minQual, false);}
293
294 //Determine how many threads may be used
295 final int threads=Tools.min(Shared.threads(), decomposedFnames.size());
296
297 //Fill a list with LoadThreads
298 ArrayList<LoadThread> allt=new ArrayList<LoadThread>(threads);
299
300 for(int i=0; i<threads; i++){
301 allt.add(new LoadThread(decomposedFnames, mode, samplerate, reads, minEntropy, minProb, minQual));
302 }
303
304 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
305
306 //Start the threads
307 for(LoadThread lt : allt){lt.start();}
308
309 //Wait for completion of all threads
310 boolean success=true;
311 for(LoadThread lt : allt){
312
313 //Wait until this thread has terminated
314 while(lt.getState()!=Thread.State.TERMINATED){
315 try {
316 //Attempt a join operation
317 lt.join();
318 } catch (InterruptedException e) {
319 //Potentially handle this, if it is expected to occur
320 e.printStackTrace();
321 }
322 }
323 sketches.addAll(lt.list);
324 success&=lt.success;
325 }
326 assert(success) : "Failure loading some files.";
327 return sketches;
328 }
329
330 /*--------------------------------------------------------------*/
331
332 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm,
333 int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){
334 return loadSketchesFromFile(fname0, smm, maxThreads, reads, mode,
335 params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch);
336 }
337
338 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm,
339 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
340 assert(fname0!=null);//123
341 if(fname0==null){return null;}
342 FileFormat ff=FileFormat.testInput(fname0, FileFormat.FASTA, null, false, true);
343 if(ff.isSequence()){
344 return loadSketchesFromSequenceFile(ff, smm, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
345 }else{
346 return SketchObject.LOADER2 ? loadSketchesFromSketchFile2(ff, allowZeroSizeSketch) : loadSketchesFromSketchFile(ff, allowZeroSizeSketch);
347 }
348 }
349
350 private ArrayList<Sketch> loadSketchesFromSequenceFile(final FileFormat ff, SketchMakerMini smm,
351 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
352 maxThreads=(maxThreads<1 ? Shared.threads() : Tools.min(maxThreads, Shared.threads()));
353
354 // assert(false) : (ff.fasta() || ff.fastq() || ff.samOrBam())+", "+ff.fastq()+", "+maxThreads+", "+
355 // allowMultithreadedFastq+", "+forceDisableMultithreadedFastq+", "+(mode==ONE_SKETCH);
356
357 if(ff.fastq() && allowMultithreadedFastq && !forceDisableMultithreadedFastq && (mode==ONE_SKETCH || mode==PER_FILE) &&
358 maxThreads>1 && Shared.threads()>2 && (reads<1 || reads*samplerate*(mergePairs ? 2 : 1)>=1000)){//limit is low due to PacBio long reads
359
360 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
361 Read.VALIDATE_IN_CONSTRUCTOR=false;
362
363 if(verbose2){System.err.println("Loading a sketch multithreaded.");}
364 Sketch sketch=processReadsMT(ff.name(), maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
365
366 Read.VALIDATE_IN_CONSTRUCTOR=vic;
367
368 ArrayList<Sketch> list=new ArrayList<Sketch>(1);
369 if(sketch!=null && (sketch.length()>0 || allowZeroSizeSketch)){
370 sketch.loadSSU();
371 list.add(sketch);
372 }
373 return list;
374 }
375 if(smm==null){smm=new SketchMakerMini(this, mode, minEntropy, minProb, minQual);}
376 if(verbose2){System.err.println("Loading sketches via SMM.");}
377 ArrayList<Sketch> sketches=smm.toSketches(ff.name(), samplerate, reads);
378 if(verbose2){System.err.println("Loaded "+(sketches==null ? 0 : sketches.size())+" sketches via SMM.");}
379 return sketches;
380 }
381
382 private ArrayList<Sketch> loadSketchesFromSketchFile(final FileFormat ff, boolean allowZeroSizeSketch){
383
384 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
385
386 if(verbose2){System.err.println("Loading sketches from text.");}
387 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
388 ByteFile bf=ByteFile.makeByteFile(ff);
389 int currentSketchSize=stTargetSketchSize;
390 int taxID=-1;
391 long spid=-1;
392 long imgID=-1;
393 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
394 long[] baseCounts=null;
395 byte[] r16S=null;
396 byte[] r18S=null;
397 int r16SLen=0;
398 int r18SLen=0;
399 float probCorrect=-1;
400 String name=null, name0=null, fname=ff.simpleName();
401 LongList list=null;
402 IntList countList=null;
403 ArrayList<String> meta=null;
404 long sum=0;
405 byte[] lastHeader=null;
406
407 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
408 if(line.length>0){
409 // System.err.println("Processing line "+new String(line));
410 if(line[0]=='#'){
411 if(r16SLen>0 || r18SLen>0){
412 byte[] ssu=KillSwitch.copyOfRange(line, 5, line.length);
413 if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){
414 assert(r16SLen>0);
415 assert(ssu.length==r16SLen) : r16SLen+", "+line.length+"\n"+new String(line)+"\n";
416 r16S=ssu;
417 r16SLen=0;
418 }else if(Tools.startsWith(line, "#18S:")){
419 assert(r18SLen>0);
420 assert(ssu.length==r18SLen) : r18SLen+", "+line.length+"\n"+new String(line)+"\n";
421 r18S=ssu;
422 r18SLen=0;
423 }else{
424 assert(false) : new String(line);
425 }
426 }else{
427 lastHeader=line;
428 if(list!=null){
429 assert(list.size==list.array.length);
430 if(NUC || unsorted){
431 list.sort();
432 list.shrinkToUnique();
433 }else{
434 list.shrink();
435 }
436 if(list.size()>0 || allowZeroSizeSketch){
437 int[] keyCounts=countList==null ? null : countList.array;
438 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
439 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
440 sketch.spid=spid;
441 sketches.add(sketch);
442 }
443 // System.err.println("Made sketch "+sketch);
444 }
445 name=name0=null;
446 fname=ff.simpleName();
447 list=null;
448 countList=null;
449 meta=null;
450 baseCounts=null;
451 r16S=null;
452 r18S=null;
453 r16SLen=0;
454 r18SLen=0;
455 sum=0;
456 taxID=-1;
457 imgID=-1;
458 genomeSizeBases=0;
459 genomeSizeKmers=0;
460 genomeSequences=0;
461 probCorrect=-1;
462 int k_sketch=defaultK;
463 int k2_sketch=0;
464 int hashVersion_sketch=1;
465
466 if(line.length>1){
467 String[] split=new String(line, 1, line.length-1).split("\t");
468 for(String s : split){
469 final int colon=s.indexOf(':');
470 final String sub=s.substring(colon+1);
471 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
472 currentSketchSize=Integer.parseInt(sub);
473 }else if(s.startsWith("CD:")){//Coding
474 A48=HEX=NUC=delta=counts=unsorted=false;
475
476 for(int i=0; i<sub.length(); i++){
477 char c=sub.charAt(i);
478 if(c=='A'){A48=true;}
479 else if(c=='H'){HEX=true;}
480 else if(c=='R'){A48=HEX=false;}
481 else if(c=='N'){NUC=true;}
482 else if(c=='D'){delta=true;}
483 else if(c=='C'){counts=true;}
484 else if(c=='U'){unsorted=true;}
485 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);}
486 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);}
487 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);}
488 }
489 }else if(s.startsWith("K:")){//Kmer length
490 if(sub.indexOf(',')>=0){
491 String[] subsplit=sub.split(",");
492 assert(subsplit.length==2) : "Bad header component "+s;
493 int x=Integer.parseInt(subsplit[0]);
494 int y=Integer.parseInt(subsplit[1]);
495 k_sketch=Tools.max(x, y);
496 k2_sketch=Tools.min(x, y);
497 }else{
498 k_sketch=Integer.parseInt(sub);
499 k2_sketch=0;
500 }
501 }else if(s.startsWith("H:")){//Hash version
502 hashVersion_sketch=Integer.parseInt(sub);
503 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
504 baseCounts=Parse.parseLongArray(sub);
505 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
506 genomeSizeBases=Long.parseLong(sub);
507 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
508 genomeSizeKmers=Long.parseLong(sub);
509 }else if(s.startsWith("GQ:")){
510 genomeSequences=Long.parseLong(sub);
511 }else if(s.startsWith("GE:")){//Genome size estimate kmers
512 //ignore
513 }else if(s.startsWith("PC:")){//Probability of correctness
514 probCorrect=Float.parseFloat(sub);
515 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
516 taxID=Integer.parseInt(sub);
517 }else if(s.startsWith("IMG:")){
518 imgID=Long.parseLong(sub);
519 }else if(s.startsWith("SPID:")){
520 spid=Integer.parseInt(sub);
521 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
522 name=sub;
523 }else if(s.startsWith("FN:")){
524 fname=sub;
525 }else if(s.startsWith("NM0:")){
526 name0=sub;
527 }else if(s.startsWith("MT_")){
528 if(meta==null){meta=new ArrayList<String>(1);}
529 meta.add(s.substring(3));
530 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
531 r16SLen=Integer.parseInt(sub);
532 }else if(s.startsWith("18S:")){
533 r18SLen=Integer.parseInt(sub);
534 }else{
535 assert(false) : "Unsupported header tag "+s;
536 }
537 }
538 }
539
540 if(KILL_OK){
541 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+
542 " differs from loaded kmer length "+k+"\n"+new String(line)+"\nfile: "+ff.name());}
543 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+
544 " differs from loaded kmer length "+k+","+k2+"\n"+new String(line)+"\nfile: "+ff.name());}
545 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
546 " differs from loaded hash version "+HASH_VERSION+".\n"
547 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\nfile: "+ff.name());}
548 }else{//Potential hang
549 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line);
550 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line);
551 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
552 " differs from loaded hash version "+HASH_VERSION+".\n"
553 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n";
554 }
555 if(currentSketchSize>0 || allowZeroSizeSketch){
556 list=new LongList(Tools.max(1, currentSketchSize));
557 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));}
558 }
559 }
560 }else{
561 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) :
562 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Parse.parseLong(line));
563 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
564 sum+=x;
565 assert(x>=0 || NUC) : "x="+x+"\nline="+new String(line)+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\nlineNum="+bf.lineNum()+"\n";
566 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
567 assert(list!=null) : new String(line);
568 long key=(delta ? sum : x);
569 if(key>=0){list.add(key);}
570 }
571 }
572 }
573 if(list!=null && (list.size>0 || allowZeroSizeSketch)){
574 assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : list.size+"!="+list.array.length;
575 if(NUC || unsorted){
576 list.sort();
577 list.shrinkToUnique();
578 }else{
579 list.shrink();
580 }
581 int[] keyCounts=countList==null ? null : countList.array;
582 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
583 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
584 sketch.spid=spid;
585 sketches.add(sketch);
586 }
587 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");}
588 return sketches;
589 }
590
591 /** Usually much faster due to not manifesting the multithreaded load Java slowdown. Should incur less garbage collection also. */
592 private ArrayList<Sketch> loadSketchesFromSketchFile2(final FileFormat ff, boolean allowZeroSizeSketch){
593
594 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
595
596 if(verbose2){System.err.println("Loading sketches from text.");}
597 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
598
599 InputStream is=ReadWrite.getInputStream(ff.name(), BUFFERED_READER, false);
600 byte[] buffer=new byte[BUFLEN];
601 int start, limit=0;
602 try {
603 limit=is.read(buffer);
604 } catch (IOException e) {
605 KillSwitch.exceptionKill(e);
606 }
607
608
609 int currentSketchSize=stTargetSketchSize;
610 int taxID=-1;
611 long spid=-1;
612 long imgID=-1;
613 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
614 long[] baseCounts=null;
615 byte[] r16S=null;
616 byte[] r18S=null;
617 int r16SLen=0;
618 int r18SLen=0;
619 float probCorrect=-1;
620 String name=null, name0=null, fname=ff.simpleName();
621 LongList list=null;
622 IntList countList=null;
623 ArrayList<String> meta=null;
624 long sum=0;
625 byte[] lastHeader=null;
626 ByteBuilder bb=new ByteBuilder(256);
627
628 for(start=0; start<limit;){
629 // System.err.println("Processing line "+new String(line));
630 if(buffer[start]=='#'){
631 bb.clear();
632 try {
633 while(buffer[start]!='\n'){
634 bb.append(buffer[start]);
635 start++;
636 if(start>=limit){start=0; limit=is.read(buffer);}
637 }
638 } catch (IOException e) {
639 KillSwitch.exceptionKill(e);
640 }
641 start++;
642
643 if(r16SLen>0 || r18SLen>0){
644 byte[] ssu=bb.toBytes(5, bb.length());
645 if(bb.startsWith("#16S:") || bb.startsWith("#SSU:")){
646 assert(r16SLen>0);
647 assert(ssu.length==r16SLen) : r16SLen+", "+bb.length+"\n"+bb+"\n";
648 r16S=ssu;
649 r16SLen=0;
650 }else if(bb.startsWith("#18S:")){
651 assert(r18SLen>0);
652 assert(ssu.length==r18SLen) : r18SLen+", "+bb.length+"\n"+bb+"\n";
653 r18S=ssu;
654 r18SLen=0;
655 }else{
656 assert(false) : bb;
657 }
658 }else{
659
660 // byte[] line=lastHeader=bb.toBytes();
661 if(list!=null){
662
663 //This assertion fails sometimes for Silva per-sequence mode, but it's not important
664 // assert(list.size==list.array.length) : list.size+", "+list.array.length+(lastHeader==null ? "" : ", "+new String(lastHeader));
665
666 if(NUC || unsorted){
667 list.sort();
668 list.shrinkToUnique();
669 }else{
670 list.shrink();
671 }
672 if(list.size()>0 || allowZeroSizeSketch){
673 int[] keyCounts=countList==null ? null : countList.array;
674 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
675 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
676 sketch.spid=spid;
677 sketches.add(sketch);
678 }
679 // System.err.println("Made sketch "+sketch);
680 }
681 name=name0=null;
682 fname=ff.simpleName();
683 list=null;
684 countList=null;
685 meta=null;
686 baseCounts=null;
687 r16S=null;
688 r18S=null;
689 r16SLen=0;
690 r18SLen=0;
691 sum=0;
692 taxID=-1;
693 imgID=-1;
694 genomeSizeBases=0;
695 genomeSizeKmers=0;
696 genomeSequences=0;
697 probCorrect=-1;
698 int k_sketch=defaultK;
699 int k2_sketch=0;
700 int hashVersion_sketch=1;
701
702 if(bb.length>1){
703 String[] split=new String(bb.array, 1, bb.length-1).split("\t");
704 for(String s : split){
705 final int colon=s.indexOf(':');
706 final String sub=s.substring(colon+1);
707 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
708 currentSketchSize=Integer.parseInt(sub);
709 }else if(s.startsWith("CD:")){//Coding
710 A48=HEX=NUC=delta=counts=unsorted=false;
711
712 for(int i=0; i<sub.length(); i++){
713 char c=sub.charAt(i);
714 if(c=='A'){A48=true;}
715 else if(c=='H'){HEX=true;}
716 else if(c=='R'){A48=HEX=false;}
717 else if(c=='N'){NUC=true;}
718 else if(c=='D'){delta=true;}
719 else if(c=='C'){counts=true;}
720 else if(c=='U'){unsorted=true;}
721 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+bb;}
722 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+bb;}
723 else{assert(false) : "Unknown coding symbol: "+c+"\t"+bb;}
724 }
725 }else if(s.startsWith("K:")){//Kmer length
726 if(sub.indexOf(',')>=0){
727 String[] subsplit=sub.split(",");
728 assert(subsplit.length==2) : "Bad header component "+s;
729 int x=Integer.parseInt(subsplit[0]);
730 int y=Integer.parseInt(subsplit[1]);
731 k_sketch=Tools.max(x, y);
732 k2_sketch=Tools.min(x, y);
733 }else{
734 k_sketch=Integer.parseInt(sub);
735 k2_sketch=0;
736 }
737 }else if(s.startsWith("H:")){//Hash version
738 hashVersion_sketch=Integer.parseInt(sub);
739 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
740 baseCounts=Parse.parseLongArray(sub);
741 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
742 genomeSizeBases=Long.parseLong(sub);
743 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
744 genomeSizeKmers=Long.parseLong(sub);
745 }else if(s.startsWith("GQ:")){
746 genomeSequences=Long.parseLong(sub);
747 }else if(s.startsWith("GE:")){//Genome size estimate kmers
748 //ignore
749 }else if(s.startsWith("PC:")){//Probability of correctness
750 probCorrect=Float.parseFloat(sub);
751 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
752 taxID=Integer.parseInt(sub);
753 }else if(s.startsWith("IMG:")){
754 imgID=Long.parseLong(sub);
755 }else if(s.startsWith("SPID:")){
756 spid=Integer.parseInt(sub);
757 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
758 name=sub;
759 }else if(s.startsWith("FN:")){
760 fname=sub;
761 }else if(s.startsWith("NM0:")){
762 name0=sub;
763 }else if(s.startsWith("MT_")){
764 if(meta==null){meta=new ArrayList<String>(1);}
765 meta.add(s.substring(3));
766 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
767 r16SLen=Integer.parseInt(sub);
768 }else if(s.startsWith("18S:")){
769 r18SLen=Integer.parseInt(sub);
770 }else{
771 assert(false) : "Unsupported header tag "+s;
772 }
773 }
774 }
775
776 if(KILL_OK){
777 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+
778 " differs from loaded kmer length "+k+"\n"+bb+"\nfile: "+ff.name());}
779 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+
780 " differs from loaded kmer length "+k+","+k2+"\n"+bb+"\nfile: "+ff.name());}
781 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
782 " differs from loaded hash version "+HASH_VERSION+".\n"
783 + "You may need to download the latest version of BBTools.\nfile: "+ff.name());}
784 }else{//Potential hang
785 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+bb;
786 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+bb;
787 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
788 " differs from loaded hash version "+HASH_VERSION+".\n"
789 + "You may need to download the latest version of BBTools.\nfile: "+ff.name();
790 }
791 if(currentSketchSize>0 || allowZeroSizeSketch){
792 list=new LongList(Tools.max(1, currentSketchSize));
793 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));}
794 }
795 }
796 }else{
797 bb.clear();
798 try {
799 while(buffer[start]!='\n'){
800 bb.append(buffer[start]);
801 start++;
802 if(start>=limit){start=0; limit=is.read(buffer);}
803 }
804 } catch (IOException e) {
805 KillSwitch.exceptionKill(e);
806 }
807 start++;
808
809 long x=(counts ? Sketch.parseA48C(bb, countList) : A48 ? Sketch.parseA48(bb) :
810 HEX ? Sketch.parseHex(bb) : NUC ? Sketch.parseNuc(bb) : Parse.parseLong(bb.array, 0, bb.length));
811 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
812 sum+=x;
813 assert(x>=0 || NUC) : "x="+x+"\nline="+bb+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\n";
814 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
815 assert(list!=null) : bb;
816 long key=(delta ? sum : x);
817 if(key>=0){list.add(key);}
818 }
819
820 if(start>=limit){
821 start=0;
822 try {
823 limit=is.read(buffer);
824 } catch (IOException e) {
825 KillSwitch.exceptionKill(e);
826 }
827 }
828 }
829 if(list!=null && (list.size>0 || allowZeroSizeSketch)){
830
831 //This assertion fails sometimes for Silva per-sequence mode, but it's not important
832 // assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) :
833 // list.size+"!="+list.array.length+(lastHeader==null ? "" : "\n"+new String(lastHeader));
834
835 if(NUC || unsorted){
836 list.sort();
837 list.shrinkToUnique();
838 }else{
839 list.shrink();
840 }
841 int[] keyCounts=countList==null ? null : countList.array;
842 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
843 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
844 sketch.spid=spid;
845 sketches.add(sketch);
846 }
847 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");}
848 try {
849 is.close();
850 } catch (IOException e) {
851 e.printStackTrace();
852 }
853 return sketches;
854 }
855
856 /*--------------------------------------------------------------*/
857
858 public ArrayList<Sketch> loadSketchesFromString(String sketchString){
859 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
860
861 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
862 int currentSketchSize=stTargetSketchSize;
863 int taxID=-1;
864 long spid=-1;
865 long imgID=-1;
866 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
867 long[] baseCounts=null;
868 byte[] r16S=null;
869 byte[] r18S=null;
870 int r16SLen=0;
871 int r18SLen=0;
872 float probCorrect=-1;
873 String name=null, name0=null, fname=null;
874 LongList list=null;
875 IntList countList=null;
876 ArrayList<String> meta=null;
877 long sum=0;
878
879 String[] split0=sketchString.split("\n");
880 for(String line : split0){
881 if(line.length()>0){
882 // System.err.println("Processing line "+new String(line));
883 if(line.charAt(0)=='#'){
884 if(line.length()>1 && line.charAt(1)=='#'){
885 //ignore
886 }else if(r16SLen>0 || r18SLen>0){
887 byte[] ssu=KillSwitch.copyOfRange(line.getBytes(), 5, line.length());
888 if(line.startsWith("#16S:") || line.startsWith("#SSU:")){
889 assert(r16SLen>0);
890 assert(ssu.length==r16SLen) : r16SLen+", "+line.length()+"\n"+line+"\n";
891 r16S=ssu;
892 r16SLen=0;
893 }else if(line.startsWith("#18S:")){
894 assert(r18SLen>0);
895 assert(ssu.length==r18SLen) : r18SLen+", "+line.length()+"\n"+line+"\n";
896 r18S=ssu;
897 r18SLen=0;
898 }else{
899 assert(false) : line;
900 }
901 }else{
902 if(list!=null){
903 assert(list.size==list.array.length);
904 if(NUC || unsorted){
905 list.sort();
906 list.shrinkToUnique();
907 }else{
908 list.shrink();
909 }
910 if(list.size()>0){
911 int[] keyCounts=countList==null ? null : countList.array;
912 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
913 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
914 sketch.spid=spid;
915 sketches.add(sketch);
916 }
917 // System.err.println("Made sketch "+sketch);
918 }
919 name=name0=null;
920 fname=null;
921 list=null;
922 countList=null;
923 meta=null;
924 baseCounts=null;
925 r16S=null;
926 r18S=null;
927 r16SLen=0;
928 r18SLen=0;
929 sum=0;
930 taxID=-1;
931 imgID=-1;
932 genomeSizeBases=0;
933 genomeSizeKmers=0;
934 genomeSequences=0;
935 probCorrect=-1;
936 int k_sketch=defaultK;
937 int k2_sketch=0;
938 int hashVersion_sketch=1;
939
940 if(line.length()>1){
941 String[] split=line.substring(1).split("\t");
942 for(String s : split){
943 final int colon=s.indexOf(':');
944 final String sub=s.substring(colon+1);
945 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
946 currentSketchSize=Integer.parseInt(sub);
947 }else if(s.startsWith("CD:")){//Coding
948 A48=HEX=NUC=delta=counts=false;
949
950 for(int i=0; i<sub.length(); i++){
951 char c=sub.charAt(i);
952 if(c=='A'){A48=true;}
953 else if(c=='H'){HEX=true;}
954 else if(c=='R'){A48=HEX=false;}
955 else if(c=='N'){NUC=true;}
956 else if(c=='D'){delta=true;}
957 else if(c=='C'){counts=true;}
958 else if(c=='U'){unsorted=true;}
959 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);}
960 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);}
961 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);}
962 }
963
964 }else if(s.startsWith("K:")){//Kmer length
965 if(sub.indexOf(',')>=0){
966 String[] subsplit=sub.split(",");
967 assert(subsplit.length==2) : "Bad header component "+s;
968 int x=Integer.parseInt(subsplit[0]);
969 int y=Integer.parseInt(subsplit[1]);
970 k_sketch=Tools.max(x, y);
971 k2_sketch=Tools.min(x, y);
972 }else{
973 k_sketch=Integer.parseInt(s);
974 k2_sketch=0;
975 }
976 }else if(s.startsWith("H:")){//Hash version
977 hashVersion_sketch=Integer.parseInt(sub);
978 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
979 baseCounts=Parse.parseLongArray(sub);
980 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
981 genomeSizeBases=Long.parseLong(sub);
982 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
983 genomeSizeKmers=Long.parseLong(sub);
984 }else if(s.startsWith("GQ:")){
985 genomeSequences=Long.parseLong(sub);
986 }else if(s.startsWith("GE:")){//Genome size estimate kmers
987 //ignore
988 }else if(s.startsWith("PC:")){//Probability of correctness
989 probCorrect=Float.parseFloat(sub);
990 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
991 taxID=Integer.parseInt(sub);
992 }else if(s.startsWith("IMG:")){
993 imgID=Long.parseLong(sub);
994 }else if(s.startsWith("SPID:")){
995 spid=Integer.parseInt(sub);
996 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
997 name=sub;
998 }else if(s.startsWith("FN:")){
999 fname=sub;
1000 }else if(s.startsWith("NM0:")){
1001 name0=sub;
1002 }else if(s.startsWith("MT_")){
1003 if(meta==null){meta=new ArrayList<String>(1);}
1004 meta.add(s.substring(3));
1005 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
1006 r16SLen=Integer.parseInt(sub);
1007 }else if(s.startsWith("18S:")){
1008 r18SLen=Integer.parseInt(sub);
1009 }else{
1010 assert(false) : "Unsupported header tag "+s;
1011 }
1012 }
1013 }
1014 if(KILL_OK){
1015 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line));}
1016 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line));}
1017 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
1018 " differs from loaded hash version "+HASH_VERSION+".\n"
1019 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n");}
1020 }else{//Potential hang
1021 assert(k_sketch==k && !NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line);
1022 assert(k2_sketch==k2 && !NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line);
1023 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
1024 " differs from loaded hash version "+HASH_VERSION+".\n"
1025 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n";
1026 }
1027
1028
1029 if(currentSketchSize>0){
1030 list=new LongList(currentSketchSize);
1031 if(counts){countList=new IntList(currentSketchSize);}
1032 }
1033 }
1034 }else{
1035 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) :
1036 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Long.parseLong(line));
1037 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
1038 sum+=x;
1039 assert(x>=0 || NUC) : x+"\n"+new String(line);
1040 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
1041 long key=(delta ? sum : x);
1042 if(key>=0){list.add(key);}
1043 }
1044 }
1045 }
1046
1047 if(list!=null){
1048 assert(list.size==list.array.length || list.size()==0 || NUC || unsorted);
1049 if(NUC || unsorted){
1050 list.sort();
1051 list.shrinkToUnique();
1052 }else{
1053 list.shrink();
1054 }
1055 int[] keyCounts=countList==null ? null : countList.array;
1056 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
1057 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
1058 sketch.spid=spid;
1059 sketches.add(sketch);
1060 }
1061 return sketches;
1062 }
1063
1064 /*--------------------------------------------------------------*/
1065 /*---------------- Threads ----------------*/
1066 /*--------------------------------------------------------------*/
1067
1068
1069
1070 private class LoadThread extends Thread{
1071
1072 public LoadThread(ConcurrentLinkedQueue<StringNum> queue_, int mode_, float samplerate_, long reads_, float minEntropy, float minProb, byte minQual) {
1073 queue=queue_;
1074 list=new ArrayList<Sketch>();
1075 smm=new SketchMakerMini(SketchTool.this, mode_, minEntropy, minProb, minQual);
1076 samplerate=samplerate_;
1077 reads=reads_;
1078 }
1079
1080 @Override
1081 public void run(){
1082 success=false;
1083 for(StringNum sn=queue.poll(); sn!=null; sn=queue.poll()){
1084 ArrayList<Sketch> temp=null;
1085 try {
1086 temp=loadSketchesFromFile(sn.s, smm, 1, reads, smm.mode, samplerate, smm.minEntropy(), smm.minProb(), smm.minQual(), false);
1087 } catch (Throwable e) {
1088 System.err.println("Failure loading "+sn+":\n"+e);
1089 e.printStackTrace();
1090 success=false;
1091 }
1092 if(temp!=null && temp.size()>0){
1093 if(smm.mode==PER_FILE){
1094 // assert(temp.size()==1) : temp.size();
1095 temp.get(0).sketchID=(int)sn.n;
1096 }
1097 for(Sketch s : temp){add(s);}
1098 }
1099 }
1100 success=true;
1101 }
1102
1103 private void add(Sketch s){
1104 if(list!=null){
1105 list.add(s);
1106 return;
1107 }
1108 assert(false) : "Unsupported."; //The map logic is broken; needs to be synchronized.
1109 // if(s.taxID<0){return;}
1110 //// assert(s.taxID>-1) : s.toHeader();
1111 // TaxNode tn=tree.getNode(s.taxID);
1112 // while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){
1113 // TaxNode temp=tree.getNode(tn.pid);
1114 // if(temp==null){break;}
1115 // tn=temp;
1116 // }
1117 // if(tn==null){return;}
1118 // Integer key=tn.id;
1119 // Sketch old=map.get(key);
1120 // if(old==null){
1121 // s.taxID=key;
1122 // map.put(key, s);
1123 // }else{
1124 // synchronized(old){
1125 // old.add(s, maxLen);
1126 // }
1127 // }
1128 }
1129
1130 final ConcurrentLinkedQueue<StringNum> queue;
1131 ArrayList<Sketch> list;
1132 boolean success=false;
1133 final SketchMakerMini smm;
1134 final float samplerate;
1135 final long reads;
1136
1137 // ConcurrentHashMap<Integer, Sketch> map;
1138
1139 }
1140
1141 private class LoadThread2 extends Thread{
1142
1143 LoadThread2(ConcurrentReadInputStream cris_, float minEntropy, float minProb, byte minQual){
1144 cris=cris_;
1145 smm=new SketchMakerMini(SketchTool.this, ONE_SKETCH, minEntropy, minProb, minQual);
1146 }
1147
1148 @Override
1149 public void run(){
1150
1151 //Grab the first ListNum of reads
1152 ListNum<Read> ln=cris.nextList();
1153 //Grab the actual read list from the ListNum
1154 ArrayList<Read> reads=(ln!=null ? ln.list : null);
1155
1156 //As long as there is a nonempty read list...
1157 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
1158
1159 //Loop through each read in the list
1160 for(int idx=0; idx<reads.size(); idx++){
1161 final Read r1=reads.get(idx);
1162 final Read r2=r1.mate;
1163
1164 if(validate){
1165 if(r1!=null){r1.validate(true);}
1166 if(r2!=null){r2.validate(true);}
1167 }
1168
1169 smm.processReadPair(r1, r2);
1170 }
1171
1172 //Notify the input stream that the list was used
1173 cris.returnList(ln);
1174
1175 //Fetch a new list
1176 ln=cris.nextList();
1177 reads=(ln!=null ? ln.list : null);
1178 }
1179
1180 //Notify the input stream that the final list was used
1181 if(ln!=null){
1182 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
1183 }
1184 }
1185
1186 private final boolean validate=!Read.VALIDATE_IN_CONSTRUCTOR;
1187 ConcurrentReadInputStream cris;
1188 SketchMakerMini smm;
1189
1190 }
1191
1192 /** Converts KmerTableSets to Heaps */
1193 private class SketchThread extends Thread {
1194
1195 SketchThread(AtomicInteger next_, KmerTableSet kts_){
1196 next=next_;
1197 kts=kts_;
1198 }
1199
1200 @Override
1201 public void run(){
1202 final int ways=kts.ways();
1203 int tnum=next.getAndIncrement();
1204 while(tnum<ways){
1205 HashArray1D table=kts.getTable(tnum);
1206 if(stTargetSketchSize>0){
1207 if(heap==null){heap=new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts);}
1208 toHeap(table, heap);
1209 }else{
1210 if(list==null){list=new LongList();}
1211 toList(table, list);
1212 }
1213 tnum=next.getAndIncrement();
1214 }
1215 }
1216
1217 final AtomicInteger next;
1218 final KmerTableSet kts;
1219 SketchHeap heap;
1220 LongList list;
1221 }
1222
1223 /*--------------------------------------------------------------*/
1224 /*---------------- Read Loading ----------------*/
1225 /*--------------------------------------------------------------*/
1226
1227 public Sketch processReadsMT(String fname, int maxThreads, long reads, int mode, float samplerate,
1228 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
1229 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
1230 return processReadsMT(fname.replaceFirst("#", "1"), fname.replaceFirst("#", "2"), maxThreads, reads, mode, samplerate,
1231 minEntropy, minProb, minQual, allowZeroSizeSketch);
1232 }else{
1233 return processReadsMT(fname, null, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
1234 }
1235 }
1236
1237 public Sketch processReadsMT(String fname1, String fname2, int maxThreads, long reads, int mode, float samplerate,
1238 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
1239 final FileFormat ffin1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
1240 final FileFormat ffin2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
1241 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
1242 }
1243
1244 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){
1245 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch);
1246 }
1247
1248 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, float samplerate,
1249 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
1250 assert(mode==ONE_SKETCH || mode==PER_FILE);
1251 final boolean compressed=ffin1.compressed();
1252
1253 maxThreads=Tools.mid(1, maxThreads, Shared.threads());
1254
1255 //Create a read input stream
1256 final ConcurrentReadInputStream cris;
1257 String simpleName;
1258 {
1259 simpleName=ffin1.simpleName();
1260 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
1261 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
1262 cris.start(); //Start the stream
1263 // if(verbose){outstream.println("Started cris");}
1264 }
1265
1266 //TODO: bgzip actually decompresses fast.
1267 final int threads=(int)Tools.min(maxThreads,
1268 1.75f*(compressed ? 1 : 2)*(ffin2==null ? 4 : 8)*(mergePairs ? 3 : minEntropy>0 ? 2 : 1));
1269
1270 if(verbose2){System.err.println("Starting "+threads+" load threads.");}
1271 ArrayList<LoadThread2> list=new ArrayList<LoadThread2>(threads);
1272 for(int i=0; i<threads; i++){
1273 list.add(new LoadThread2(cris, minEntropy, minProb, minQual));
1274 list.get(i).start();
1275 }
1276
1277 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads);
1278
1279 for(LoadThread2 pt : list){
1280
1281 //Wait until this thread has terminated
1282 while(pt.getState()!=Thread.State.TERMINATED){
1283 try {
1284 //Attempt a join operation
1285 pt.join();
1286 } catch (InterruptedException e) {
1287 //Potentially handle this, if it is expected to occur
1288 e.printStackTrace();
1289 }
1290 }
1291
1292 if(pt.smm.heap!=null && pt.smm.heap.size()>0){
1293 heaps.add(pt.smm.heap);
1294 }
1295 }
1296 list.clear();
1297 ReadWrite.closeStream(cris);
1298
1299 if(verbose2){System.err.println("Generating a sketch by combining thread output.");}
1300 Sketch sketch=toSketch(heaps, allowZeroSizeSketch);
1301 if(verbose2){System.err.println("Resulting sketch: "+((sketch==null) ? "null" : "length="+sketch.length()));}
1302 if(sketch!=null){sketch.setFname(simpleName);}
1303 return sketch;
1304 }
1305
1306 /*--------------------------------------------------------------*/
1307 /*---------------- Writing ----------------*/
1308 /*--------------------------------------------------------------*/
1309
1310 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff[]){
1311 final int len=ff.length;
1312 ByteStreamWriter tsw[]=new ByteStreamWriter[len];
1313 for(int i=0; i<len; i++){
1314 tsw[i]=new ByteStreamWriter(ff[i]);
1315 tsw[i].start();
1316 }
1317 boolean error=false;
1318 for(int i=0; i<sketches.size(); i++){
1319 write(sketches.get(i), tsw[i%len], new ByteBuilder());
1320 }
1321 for(int i=0; i<len; i++){
1322 error|=tsw[i].poisonAndWait();
1323 }
1324 return error;
1325 }
1326
1327 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff){
1328 final ByteStreamWriter tsw=new ByteStreamWriter(ff);
1329 final ByteBuilder bb=new ByteBuilder();
1330 tsw.start();
1331 for(Sketch sketch : sketches){
1332 write(sketch, tsw, bb);
1333 }
1334 return tsw.poisonAndWait();
1335 }
1336
1337 public static boolean write(Sketch sketch, FileFormat ff){
1338 // System.err.println(ff.name()+", "+new File(ff.name()).exists());
1339 ByteStreamWriter tsw=new ByteStreamWriter(ff);
1340 // assert(false) : new File(ff.name()).exists();
1341 tsw.start();
1342 write(sketch, tsw, null);
1343 return tsw.poisonAndWait();
1344 }
1345
1346 public static void write(Sketch sketch, ByteStreamWriter tsw, ByteBuilder bb){
1347 if(bb==null){bb=new ByteBuilder();}
1348 else{bb.clear();}
1349 sketch.toBytes(bb);
1350 tsw.print(bb);
1351 }
1352
1353 /*--------------------------------------------------------------*/
1354 /*---------------- Fields ----------------*/
1355 /*--------------------------------------------------------------*/
1356
1357 // final EntropyTracker eTracker;
1358 final int stTargetSketchSize;
1359 public final int minKeyOccuranceCount;
1360 /** Force kmer counts to be tracked. */
1361 public final boolean trackCounts;
1362 /** Merge reads before processing kmers. */
1363 public final boolean mergePairs;
1364
1365 public static int BUFLEN=16384;
1366 public static boolean BUFFERED_READER=false;
1367
1368 /*--------------------------------------------------------------*/
1369 /*---------------- Static Fields ----------------*/
1370 /*--------------------------------------------------------------*/
1371
1372 // public static boolean verbose=false;
1373
1374 }