Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchTool.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package sketch; | |
2 | |
3 import java.io.File; | |
4 import java.io.IOException; | |
5 import java.io.InputStream; | |
6 import java.util.ArrayList; | |
7 import java.util.Collection; | |
8 import java.util.concurrent.ConcurrentLinkedQueue; | |
9 import java.util.concurrent.atomic.AtomicInteger; | |
10 | |
11 import dna.AminoAcid; | |
12 import fileIO.ByteFile; | |
13 import fileIO.ByteStreamWriter; | |
14 import fileIO.FileFormat; | |
15 import fileIO.ReadWrite; | |
16 import kmer.HashArray1D; | |
17 import kmer.HashForest; | |
18 import kmer.KmerNode; | |
19 import kmer.KmerTableSet; | |
20 import shared.KillSwitch; | |
21 import shared.Parse; | |
22 import shared.Shared; | |
23 import shared.Tools; | |
24 import stream.ConcurrentReadInputStream; | |
25 import stream.Read; | |
26 import structures.ByteBuilder; | |
27 import structures.IntList; | |
28 import structures.ListNum; | |
29 import structures.LongList; | |
30 import structures.StringNum; | |
31 | |
32 /** | |
33 * @author Brian Bushnell | |
34 * @date June 28, 2016 | |
35 * | |
36 */ | |
37 public final class SketchTool extends SketchObject { | |
38 | |
39 /*--------------------------------------------------------------*/ | |
40 /*---------------- Constructor ----------------*/ | |
41 /*--------------------------------------------------------------*/ | |
42 | |
43 public SketchTool(int size_, DisplayParams params){ | |
44 this(size_, params.minKeyOccuranceCount, params.trackCounts(), params.mergePairs); | |
45 } | |
46 | |
47 public SketchTool(int size_, int minKeyOccuranceCount_, boolean trackCounts_, boolean mergePairs_){ | |
48 stTargetSketchSize=size_; | |
49 minKeyOccuranceCount=minKeyOccuranceCount_; | |
50 trackCounts=trackCounts_; | |
51 mergePairs=mergePairs_; | |
52 | |
53 assert(!aminoOrTranslate() || !rcomp) : "rcomp should be false in amino mode."; | |
54 assert(!aminoOrTranslate() || (k*AminoAcid.AMINO_SHIFT<64)) : "Protein sketches require 1 <= K <= "+(63/AminoAcid.AMINO_SHIFT)+"."; | |
55 assert(k>0 && k<=32) : "Sketches require 1 <= K <= 32."; //123 | |
56 } | |
57 | |
58 /*--------------------------------------------------------------*/ | |
59 /*---------------- Methods ----------------*/ | |
60 /*--------------------------------------------------------------*/ | |
61 | |
62 public Sketch toSketch(KmerTableSet tables, boolean multithreaded){ | |
63 final int threads=(multithreaded ? Tools.mid(1, Shared.threads(), tables.ways()) : 1); | |
64 return (threads<2 ? toSketch_ST(tables) : toSketch_MT(tables, threads)); | |
65 } | |
66 | |
67 private Sketch toSketch_ST(KmerTableSet tables){ | |
68 SketchHeap heap=(stTargetSketchSize>0 ? new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts) : null); | |
69 LongList list=new LongList(); | |
70 | |
71 KmerTableSet kts=(KmerTableSet)tables; | |
72 for(int tnum=0; tnum<kts.ways; tnum++){ | |
73 HashArray1D table=kts.getTable(tnum); | |
74 if(stTargetSketchSize>0){ | |
75 toHeap(table, heap); | |
76 }else{ | |
77 toList(table, list); | |
78 } | |
79 } | |
80 return stTargetSketchSize>0 ? new Sketch(heap, false, trackCounts, null) : toSketch(list);//TODO: Could add counts here | |
81 } | |
82 | |
83 private Sketch toSketch_MT(KmerTableSet tables, final int threads){ | |
84 ArrayList<SketchThread> alst=new ArrayList<SketchThread>(threads); | |
85 AtomicInteger ai=new AtomicInteger(0); | |
86 for(int i=0; i<threads; i++){ | |
87 alst.add(new SketchThread(ai, tables)); | |
88 } | |
89 | |
90 //Start the threads | |
91 for(SketchThread pt : alst){ | |
92 pt.start(); | |
93 } | |
94 | |
95 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads); | |
96 LongList list=new LongList(); | |
97 | |
98 for(SketchThread pt : alst){ | |
99 | |
100 //Wait until this thread has terminated | |
101 while(pt.getState()!=Thread.State.TERMINATED){ | |
102 try { | |
103 //Attempt a join operation | |
104 pt.join(); | |
105 } catch (InterruptedException e) { | |
106 //Potentially handle this, if it is expected to occur | |
107 e.printStackTrace(); | |
108 } | |
109 } | |
110 | |
111 if(stTargetSketchSize>=0){ | |
112 if(pt.heap!=null && pt.heap.size()>0){ | |
113 heaps.add(pt.heap); | |
114 } | |
115 }else{ | |
116 if(pt.list!=null){list.append(pt.list);} | |
117 pt.list=null; | |
118 } | |
119 } | |
120 alst.clear(); | |
121 | |
122 return stTargetSketchSize>=0 ? toSketch(heaps, true) : toSketch(list);//TODO: Could add counts here | |
123 } | |
124 | |
125 public SketchHeap toHeap(HashArray1D table, SketchHeap heap){ | |
126 // if(heap==null){heap=new LongHeap(size, true);} | |
127 long[] kmers=table.array(); | |
128 int[] counts=table.values(); | |
129 for(int i=0; i<table.arrayLength(); i++){ | |
130 int count=counts[i]; | |
131 if(count>=minKeyOccuranceCount){ | |
132 heap.genomeSizeKmers++; | |
133 long kmer=kmers[i]; | |
134 long hashcode=hash(kmer); | |
135 if(hashcode>=minHashValue){ | |
136 heap.add(hashcode); | |
137 } | |
138 } | |
139 } | |
140 HashForest forest=table.victims(); | |
141 if(forest!=null){ | |
142 for(KmerNode kn : forest.array()){ | |
143 if(kn!=null){addRecursive(heap, kn);} | |
144 } | |
145 } | |
146 return heap; | |
147 } | |
148 | |
149 public LongList toList(HashArray1D table, LongList list){ | |
150 // if(heap==null){heap=new LongHeap(size, true);} | |
151 long[] kmers=table.array(); | |
152 int[] counts=table.values(); | |
153 for(int i=0; i<table.arrayLength(); i++){ | |
154 int count=counts[i]; | |
155 if(count>=minKeyOccuranceCount){ | |
156 long kmer=kmers[i]; | |
157 long hashcode=hash(kmer); | |
158 if(hashcode>=minHashValue){ | |
159 list.add(hashcode); | |
160 } | |
161 } | |
162 } | |
163 HashForest forest=table.victims(); | |
164 if(forest!=null){ | |
165 for(KmerNode kn : forest.array()){ | |
166 if(kn!=null){addRecursive(list, kn);} | |
167 } | |
168 } | |
169 return list; | |
170 } | |
171 | |
172 // public long[] toSketchArray(ArrayList<LongHeap> heaps){ | |
173 // if(heaps.size()==1){return toSketchArray(heaps.get(0));} | |
174 // LongList list=new LongList(size); | |
175 // for(LongHeap heap : heaps){ | |
176 // while(heap.size()>0){list.add(Long.MAX_VALUE-heap.poll());} | |
177 // } | |
178 // list.sort(); | |
179 // list.shrinkToUnique(); | |
180 // list.size=Tools.min(size, list.size); | |
181 // return list.toArray(); | |
182 // } | |
183 | |
184 public Sketch toSketch(ArrayList<SketchHeap> heaps, boolean allowZeroSizeSketch){ | |
185 if(heaps==null || heaps.isEmpty()){ | |
186 if(allowZeroSizeSketch){ | |
187 return new Sketch(new long[0], null, null, null, null, null); | |
188 }else{ | |
189 return null; | |
190 } | |
191 } | |
192 SketchHeap a=heaps.get(0); | |
193 for(int i=1; i<heaps.size(); i++){ | |
194 SketchHeap b=heaps.get(i); | |
195 a.add(b); | |
196 } | |
197 if(verbose2){System.err.println("Creating a sketch of size "+a.size()+".");} | |
198 return new Sketch(a, false, trackCounts, null); | |
199 } | |
200 | |
201 Sketch toSketch(LongList list){ | |
202 list.sort(); | |
203 assert(list.size==0 || list.get(list.size()-1)>=minHashValue) : list.size+", "+list.get(list.size()-1)+", "+minHashValue; | |
204 list.shrinkToUnique(); | |
205 list.reverse(); | |
206 for(int i=0; i<list.size; i++){list.array[i]=Long.MAX_VALUE-list.array[i];} | |
207 return new Sketch(list.toArray(), null, null, null, null, null); | |
208 } | |
209 | |
210 /*--------------------------------------------------------------*/ | |
211 /*---------------- Helpers ----------------*/ | |
212 /*--------------------------------------------------------------*/ | |
213 | |
214 private void addRecursive(SketchHeap heap, KmerNode kn){ | |
215 if(kn==null){return;} | |
216 if(kn.count()>=minKeyOccuranceCount){ | |
217 heap.genomeSizeKmers++; | |
218 long kmer=kn.pivot(); | |
219 long hashcode=hash(kmer); | |
220 if(hashcode>=minHashValue){heap.add(hashcode);} | |
221 } | |
222 if(kn.left()!=null){addRecursive(heap, kn.left());} | |
223 if(kn.right()!=null){addRecursive(heap, kn.right());} | |
224 } | |
225 | |
226 private void addRecursive(LongList list, KmerNode kn){ | |
227 if(kn==null){return;} | |
228 if(kn.count()>=minKeyOccuranceCount){ | |
229 long kmer=kn.pivot(); | |
230 long hashcode=hash(kmer); | |
231 if(hashcode>=minHashValue){list.add(hashcode);} | |
232 } | |
233 if(kn.left()!=null){addRecursive(list, kn.left());} | |
234 if(kn.right()!=null){addRecursive(list, kn.right());} | |
235 } | |
236 | |
237 /*--------------------------------------------------------------*/ | |
238 /*---------------- I/O ----------------*/ | |
239 /*--------------------------------------------------------------*/ | |
240 | |
241 // public static ArrayList<Sketch> loadSketches_ST(String...fnames){ | |
242 // ArrayList<Sketch> sketches=null; | |
243 // for(String s : fnames){ | |
244 // ArrayList<Sketch> temp; | |
245 // if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){ | |
246 // temp=loadSketches(s); | |
247 // }else{ | |
248 // temp=loadSketches_ST(s.split(",")); | |
249 // } | |
250 // if(sketches==null){sketches=temp;} | |
251 // else{sketches.addAll(temp);} | |
252 // } | |
253 // return sketches; | |
254 // } | |
255 | |
256 // public static ArrayList<Sketch> loadSketches_MT(ArrayList<String> fnames){ | |
257 // return loadSketches_MT(0, null, fnames.toArray(new String[0])); | |
258 // } | |
259 | |
260 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, Collection<String> fnames){ | |
261 return loadSketches_MT(params.mode, params.samplerate, params.maxReads, | |
262 params.minEntropy, params.minProb, params.minQual, fnames); | |
263 } | |
264 | |
265 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, String...fnames){ | |
266 return loadSketches_MT(params.mode, params.samplerate, params.maxReads, | |
267 params.minEntropy, params.minProb, params.minQual, fnames); | |
268 } | |
269 | |
270 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, Collection<String> fnames){ | |
271 return loadSketches_MT(mode, samplerate, reads, minEntropy, minProb, minQual, fnames.toArray(new String[0])); | |
272 } | |
273 | |
274 //TODO: This is only multithreaded per file in persequence mode. | |
275 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, String...fnames){ | |
276 | |
277 ConcurrentLinkedQueue<StringNum> decomposedFnames=new ConcurrentLinkedQueue<StringNum>(); | |
278 int num=0; | |
279 for(String s : fnames){ | |
280 if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){ | |
281 num++; | |
282 decomposedFnames.add(new StringNum(s, num)); | |
283 }else{ | |
284 for(String s2 : s.split(",")){ | |
285 num++; | |
286 decomposedFnames.add(new StringNum(s2, num)); | |
287 } | |
288 } | |
289 } | |
290 | |
291 if(decomposedFnames.size()==0){return null;} | |
292 if(decomposedFnames.size()==1){return loadSketchesFromFile(decomposedFnames.poll().s, null, 0, reads, mode, samplerate, minEntropy, minProb, minQual, false);} | |
293 | |
294 //Determine how many threads may be used | |
295 final int threads=Tools.min(Shared.threads(), decomposedFnames.size()); | |
296 | |
297 //Fill a list with LoadThreads | |
298 ArrayList<LoadThread> allt=new ArrayList<LoadThread>(threads); | |
299 | |
300 for(int i=0; i<threads; i++){ | |
301 allt.add(new LoadThread(decomposedFnames, mode, samplerate, reads, minEntropy, minProb, minQual)); | |
302 } | |
303 | |
304 ArrayList<Sketch> sketches=new ArrayList<Sketch>(); | |
305 | |
306 //Start the threads | |
307 for(LoadThread lt : allt){lt.start();} | |
308 | |
309 //Wait for completion of all threads | |
310 boolean success=true; | |
311 for(LoadThread lt : allt){ | |
312 | |
313 //Wait until this thread has terminated | |
314 while(lt.getState()!=Thread.State.TERMINATED){ | |
315 try { | |
316 //Attempt a join operation | |
317 lt.join(); | |
318 } catch (InterruptedException e) { | |
319 //Potentially handle this, if it is expected to occur | |
320 e.printStackTrace(); | |
321 } | |
322 } | |
323 sketches.addAll(lt.list); | |
324 success&=lt.success; | |
325 } | |
326 assert(success) : "Failure loading some files."; | |
327 return sketches; | |
328 } | |
329 | |
330 /*--------------------------------------------------------------*/ | |
331 | |
332 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm, | |
333 int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){ | |
334 return loadSketchesFromFile(fname0, smm, maxThreads, reads, mode, | |
335 params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch); | |
336 } | |
337 | |
338 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm, | |
339 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ | |
340 assert(fname0!=null);//123 | |
341 if(fname0==null){return null;} | |
342 FileFormat ff=FileFormat.testInput(fname0, FileFormat.FASTA, null, false, true); | |
343 if(ff.isSequence()){ | |
344 return loadSketchesFromSequenceFile(ff, smm, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); | |
345 }else{ | |
346 return SketchObject.LOADER2 ? loadSketchesFromSketchFile2(ff, allowZeroSizeSketch) : loadSketchesFromSketchFile(ff, allowZeroSizeSketch); | |
347 } | |
348 } | |
349 | |
350 private ArrayList<Sketch> loadSketchesFromSequenceFile(final FileFormat ff, SketchMakerMini smm, | |
351 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ | |
352 maxThreads=(maxThreads<1 ? Shared.threads() : Tools.min(maxThreads, Shared.threads())); | |
353 | |
354 // assert(false) : (ff.fasta() || ff.fastq() || ff.samOrBam())+", "+ff.fastq()+", "+maxThreads+", "+ | |
355 // allowMultithreadedFastq+", "+forceDisableMultithreadedFastq+", "+(mode==ONE_SKETCH); | |
356 | |
357 if(ff.fastq() && allowMultithreadedFastq && !forceDisableMultithreadedFastq && (mode==ONE_SKETCH || mode==PER_FILE) && | |
358 maxThreads>1 && Shared.threads()>2 && (reads<1 || reads*samplerate*(mergePairs ? 2 : 1)>=1000)){//limit is low due to PacBio long reads | |
359 | |
360 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; | |
361 Read.VALIDATE_IN_CONSTRUCTOR=false; | |
362 | |
363 if(verbose2){System.err.println("Loading a sketch multithreaded.");} | |
364 Sketch sketch=processReadsMT(ff.name(), maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); | |
365 | |
366 Read.VALIDATE_IN_CONSTRUCTOR=vic; | |
367 | |
368 ArrayList<Sketch> list=new ArrayList<Sketch>(1); | |
369 if(sketch!=null && (sketch.length()>0 || allowZeroSizeSketch)){ | |
370 sketch.loadSSU(); | |
371 list.add(sketch); | |
372 } | |
373 return list; | |
374 } | |
375 if(smm==null){smm=new SketchMakerMini(this, mode, minEntropy, minProb, minQual);} | |
376 if(verbose2){System.err.println("Loading sketches via SMM.");} | |
377 ArrayList<Sketch> sketches=smm.toSketches(ff.name(), samplerate, reads); | |
378 if(verbose2){System.err.println("Loaded "+(sketches==null ? 0 : sketches.size())+" sketches via SMM.");} | |
379 return sketches; | |
380 } | |
381 | |
382 private ArrayList<Sketch> loadSketchesFromSketchFile(final FileFormat ff, boolean allowZeroSizeSketch){ | |
383 | |
384 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false; | |
385 | |
386 if(verbose2){System.err.println("Loading sketches from text.");} | |
387 ArrayList<Sketch> sketches=new ArrayList<Sketch>(); | |
388 ByteFile bf=ByteFile.makeByteFile(ff); | |
389 int currentSketchSize=stTargetSketchSize; | |
390 int taxID=-1; | |
391 long spid=-1; | |
392 long imgID=-1; | |
393 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0; | |
394 long[] baseCounts=null; | |
395 byte[] r16S=null; | |
396 byte[] r18S=null; | |
397 int r16SLen=0; | |
398 int r18SLen=0; | |
399 float probCorrect=-1; | |
400 String name=null, name0=null, fname=ff.simpleName(); | |
401 LongList list=null; | |
402 IntList countList=null; | |
403 ArrayList<String> meta=null; | |
404 long sum=0; | |
405 byte[] lastHeader=null; | |
406 | |
407 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ | |
408 if(line.length>0){ | |
409 // System.err.println("Processing line "+new String(line)); | |
410 if(line[0]=='#'){ | |
411 if(r16SLen>0 || r18SLen>0){ | |
412 byte[] ssu=KillSwitch.copyOfRange(line, 5, line.length); | |
413 if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){ | |
414 assert(r16SLen>0); | |
415 assert(ssu.length==r16SLen) : r16SLen+", "+line.length+"\n"+new String(line)+"\n"; | |
416 r16S=ssu; | |
417 r16SLen=0; | |
418 }else if(Tools.startsWith(line, "#18S:")){ | |
419 assert(r18SLen>0); | |
420 assert(ssu.length==r18SLen) : r18SLen+", "+line.length+"\n"+new String(line)+"\n"; | |
421 r18S=ssu; | |
422 r18SLen=0; | |
423 }else{ | |
424 assert(false) : new String(line); | |
425 } | |
426 }else{ | |
427 lastHeader=line; | |
428 if(list!=null){ | |
429 assert(list.size==list.array.length); | |
430 if(NUC || unsorted){ | |
431 list.sort(); | |
432 list.shrinkToUnique(); | |
433 }else{ | |
434 list.shrink(); | |
435 } | |
436 if(list.size()>0 || allowZeroSizeSketch){ | |
437 int[] keyCounts=countList==null ? null : countList.array; | |
438 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, | |
439 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); | |
440 sketch.spid=spid; | |
441 sketches.add(sketch); | |
442 } | |
443 // System.err.println("Made sketch "+sketch); | |
444 } | |
445 name=name0=null; | |
446 fname=ff.simpleName(); | |
447 list=null; | |
448 countList=null; | |
449 meta=null; | |
450 baseCounts=null; | |
451 r16S=null; | |
452 r18S=null; | |
453 r16SLen=0; | |
454 r18SLen=0; | |
455 sum=0; | |
456 taxID=-1; | |
457 imgID=-1; | |
458 genomeSizeBases=0; | |
459 genomeSizeKmers=0; | |
460 genomeSequences=0; | |
461 probCorrect=-1; | |
462 int k_sketch=defaultK; | |
463 int k2_sketch=0; | |
464 int hashVersion_sketch=1; | |
465 | |
466 if(line.length>1){ | |
467 String[] split=new String(line, 1, line.length-1).split("\t"); | |
468 for(String s : split){ | |
469 final int colon=s.indexOf(':'); | |
470 final String sub=s.substring(colon+1); | |
471 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length | |
472 currentSketchSize=Integer.parseInt(sub); | |
473 }else if(s.startsWith("CD:")){//Coding | |
474 A48=HEX=NUC=delta=counts=unsorted=false; | |
475 | |
476 for(int i=0; i<sub.length(); i++){ | |
477 char c=sub.charAt(i); | |
478 if(c=='A'){A48=true;} | |
479 else if(c=='H'){HEX=true;} | |
480 else if(c=='R'){A48=HEX=false;} | |
481 else if(c=='N'){NUC=true;} | |
482 else if(c=='D'){delta=true;} | |
483 else if(c=='C'){counts=true;} | |
484 else if(c=='U'){unsorted=true;} | |
485 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);} | |
486 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);} | |
487 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);} | |
488 } | |
489 }else if(s.startsWith("K:")){//Kmer length | |
490 if(sub.indexOf(',')>=0){ | |
491 String[] subsplit=sub.split(","); | |
492 assert(subsplit.length==2) : "Bad header component "+s; | |
493 int x=Integer.parseInt(subsplit[0]); | |
494 int y=Integer.parseInt(subsplit[1]); | |
495 k_sketch=Tools.max(x, y); | |
496 k2_sketch=Tools.min(x, y); | |
497 }else{ | |
498 k_sketch=Integer.parseInt(sub); | |
499 k2_sketch=0; | |
500 } | |
501 }else if(s.startsWith("H:")){//Hash version | |
502 hashVersion_sketch=Integer.parseInt(sub); | |
503 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN | |
504 baseCounts=Parse.parseLongArray(sub); | |
505 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases | |
506 genomeSizeBases=Long.parseLong(sub); | |
507 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers | |
508 genomeSizeKmers=Long.parseLong(sub); | |
509 }else if(s.startsWith("GQ:")){ | |
510 genomeSequences=Long.parseLong(sub); | |
511 }else if(s.startsWith("GE:")){//Genome size estimate kmers | |
512 //ignore | |
513 }else if(s.startsWith("PC:")){//Probability of correctness | |
514 probCorrect=Float.parseFloat(sub); | |
515 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){ | |
516 taxID=Integer.parseInt(sub); | |
517 }else if(s.startsWith("IMG:")){ | |
518 imgID=Long.parseLong(sub); | |
519 }else if(s.startsWith("SPID:")){ | |
520 spid=Integer.parseInt(sub); | |
521 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){ | |
522 name=sub; | |
523 }else if(s.startsWith("FN:")){ | |
524 fname=sub; | |
525 }else if(s.startsWith("NM0:")){ | |
526 name0=sub; | |
527 }else if(s.startsWith("MT_")){ | |
528 if(meta==null){meta=new ArrayList<String>(1);} | |
529 meta.add(s.substring(3)); | |
530 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){ | |
531 r16SLen=Integer.parseInt(sub); | |
532 }else if(s.startsWith("18S:")){ | |
533 r18SLen=Integer.parseInt(sub); | |
534 }else{ | |
535 assert(false) : "Unsupported header tag "+s; | |
536 } | |
537 } | |
538 } | |
539 | |
540 if(KILL_OK){ | |
541 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+ | |
542 " differs from loaded kmer length "+k+"\n"+new String(line)+"\nfile: "+ff.name());} | |
543 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+ | |
544 " differs from loaded kmer length "+k+","+k2+"\n"+new String(line)+"\nfile: "+ff.name());} | |
545 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+ | |
546 " differs from loaded hash version "+HASH_VERSION+".\n" | |
547 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\nfile: "+ff.name());} | |
548 }else{//Potential hang | |
549 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line); | |
550 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line); | |
551 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+ | |
552 " differs from loaded hash version "+HASH_VERSION+".\n" | |
553 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n"; | |
554 } | |
555 if(currentSketchSize>0 || allowZeroSizeSketch){ | |
556 list=new LongList(Tools.max(1, currentSketchSize)); | |
557 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));} | |
558 } | |
559 } | |
560 }else{ | |
561 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) : | |
562 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Parse.parseLong(line)); | |
563 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x)); | |
564 sum+=x; | |
565 assert(x>=0 || NUC) : "x="+x+"\nline="+new String(line)+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\nlineNum="+bf.lineNum()+"\n"; | |
566 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it."; | |
567 assert(list!=null) : new String(line); | |
568 long key=(delta ? sum : x); | |
569 if(key>=0){list.add(key);} | |
570 } | |
571 } | |
572 } | |
573 if(list!=null && (list.size>0 || allowZeroSizeSketch)){ | |
574 assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : list.size+"!="+list.array.length; | |
575 if(NUC || unsorted){ | |
576 list.sort(); | |
577 list.shrinkToUnique(); | |
578 }else{ | |
579 list.shrink(); | |
580 } | |
581 int[] keyCounts=countList==null ? null : countList.array; | |
582 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, | |
583 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); | |
584 sketch.spid=spid; | |
585 sketches.add(sketch); | |
586 } | |
587 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");} | |
588 return sketches; | |
589 } | |
590 | |
591 /** Usually much faster due to not manifesting the multithreaded load Java slowdown. Should incur less garbage collection also. */ | |
592 private ArrayList<Sketch> loadSketchesFromSketchFile2(final FileFormat ff, boolean allowZeroSizeSketch){ | |
593 | |
594 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false; | |
595 | |
596 if(verbose2){System.err.println("Loading sketches from text.");} | |
597 ArrayList<Sketch> sketches=new ArrayList<Sketch>(); | |
598 | |
599 InputStream is=ReadWrite.getInputStream(ff.name(), BUFFERED_READER, false); | |
600 byte[] buffer=new byte[BUFLEN]; | |
601 int start, limit=0; | |
602 try { | |
603 limit=is.read(buffer); | |
604 } catch (IOException e) { | |
605 KillSwitch.exceptionKill(e); | |
606 } | |
607 | |
608 | |
609 int currentSketchSize=stTargetSketchSize; | |
610 int taxID=-1; | |
611 long spid=-1; | |
612 long imgID=-1; | |
613 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0; | |
614 long[] baseCounts=null; | |
615 byte[] r16S=null; | |
616 byte[] r18S=null; | |
617 int r16SLen=0; | |
618 int r18SLen=0; | |
619 float probCorrect=-1; | |
620 String name=null, name0=null, fname=ff.simpleName(); | |
621 LongList list=null; | |
622 IntList countList=null; | |
623 ArrayList<String> meta=null; | |
624 long sum=0; | |
625 byte[] lastHeader=null; | |
626 ByteBuilder bb=new ByteBuilder(256); | |
627 | |
628 for(start=0; start<limit;){ | |
629 // System.err.println("Processing line "+new String(line)); | |
630 if(buffer[start]=='#'){ | |
631 bb.clear(); | |
632 try { | |
633 while(buffer[start]!='\n'){ | |
634 bb.append(buffer[start]); | |
635 start++; | |
636 if(start>=limit){start=0; limit=is.read(buffer);} | |
637 } | |
638 } catch (IOException e) { | |
639 KillSwitch.exceptionKill(e); | |
640 } | |
641 start++; | |
642 | |
643 if(r16SLen>0 || r18SLen>0){ | |
644 byte[] ssu=bb.toBytes(5, bb.length()); | |
645 if(bb.startsWith("#16S:") || bb.startsWith("#SSU:")){ | |
646 assert(r16SLen>0); | |
647 assert(ssu.length==r16SLen) : r16SLen+", "+bb.length+"\n"+bb+"\n"; | |
648 r16S=ssu; | |
649 r16SLen=0; | |
650 }else if(bb.startsWith("#18S:")){ | |
651 assert(r18SLen>0); | |
652 assert(ssu.length==r18SLen) : r18SLen+", "+bb.length+"\n"+bb+"\n"; | |
653 r18S=ssu; | |
654 r18SLen=0; | |
655 }else{ | |
656 assert(false) : bb; | |
657 } | |
658 }else{ | |
659 | |
660 // byte[] line=lastHeader=bb.toBytes(); | |
661 if(list!=null){ | |
662 | |
663 //This assertion fails sometimes for Silva per-sequence mode, but it's not important | |
664 // assert(list.size==list.array.length) : list.size+", "+list.array.length+(lastHeader==null ? "" : ", "+new String(lastHeader)); | |
665 | |
666 if(NUC || unsorted){ | |
667 list.sort(); | |
668 list.shrinkToUnique(); | |
669 }else{ | |
670 list.shrink(); | |
671 } | |
672 if(list.size()>0 || allowZeroSizeSketch){ | |
673 int[] keyCounts=countList==null ? null : countList.array; | |
674 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, | |
675 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); | |
676 sketch.spid=spid; | |
677 sketches.add(sketch); | |
678 } | |
679 // System.err.println("Made sketch "+sketch); | |
680 } | |
681 name=name0=null; | |
682 fname=ff.simpleName(); | |
683 list=null; | |
684 countList=null; | |
685 meta=null; | |
686 baseCounts=null; | |
687 r16S=null; | |
688 r18S=null; | |
689 r16SLen=0; | |
690 r18SLen=0; | |
691 sum=0; | |
692 taxID=-1; | |
693 imgID=-1; | |
694 genomeSizeBases=0; | |
695 genomeSizeKmers=0; | |
696 genomeSequences=0; | |
697 probCorrect=-1; | |
698 int k_sketch=defaultK; | |
699 int k2_sketch=0; | |
700 int hashVersion_sketch=1; | |
701 | |
702 if(bb.length>1){ | |
703 String[] split=new String(bb.array, 1, bb.length-1).split("\t"); | |
704 for(String s : split){ | |
705 final int colon=s.indexOf(':'); | |
706 final String sub=s.substring(colon+1); | |
707 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length | |
708 currentSketchSize=Integer.parseInt(sub); | |
709 }else if(s.startsWith("CD:")){//Coding | |
710 A48=HEX=NUC=delta=counts=unsorted=false; | |
711 | |
712 for(int i=0; i<sub.length(); i++){ | |
713 char c=sub.charAt(i); | |
714 if(c=='A'){A48=true;} | |
715 else if(c=='H'){HEX=true;} | |
716 else if(c=='R'){A48=HEX=false;} | |
717 else if(c=='N'){NUC=true;} | |
718 else if(c=='D'){delta=true;} | |
719 else if(c=='C'){counts=true;} | |
720 else if(c=='U'){unsorted=true;} | |
721 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+bb;} | |
722 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+bb;} | |
723 else{assert(false) : "Unknown coding symbol: "+c+"\t"+bb;} | |
724 } | |
725 }else if(s.startsWith("K:")){//Kmer length | |
726 if(sub.indexOf(',')>=0){ | |
727 String[] subsplit=sub.split(","); | |
728 assert(subsplit.length==2) : "Bad header component "+s; | |
729 int x=Integer.parseInt(subsplit[0]); | |
730 int y=Integer.parseInt(subsplit[1]); | |
731 k_sketch=Tools.max(x, y); | |
732 k2_sketch=Tools.min(x, y); | |
733 }else{ | |
734 k_sketch=Integer.parseInt(sub); | |
735 k2_sketch=0; | |
736 } | |
737 }else if(s.startsWith("H:")){//Hash version | |
738 hashVersion_sketch=Integer.parseInt(sub); | |
739 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN | |
740 baseCounts=Parse.parseLongArray(sub); | |
741 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases | |
742 genomeSizeBases=Long.parseLong(sub); | |
743 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers | |
744 genomeSizeKmers=Long.parseLong(sub); | |
745 }else if(s.startsWith("GQ:")){ | |
746 genomeSequences=Long.parseLong(sub); | |
747 }else if(s.startsWith("GE:")){//Genome size estimate kmers | |
748 //ignore | |
749 }else if(s.startsWith("PC:")){//Probability of correctness | |
750 probCorrect=Float.parseFloat(sub); | |
751 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){ | |
752 taxID=Integer.parseInt(sub); | |
753 }else if(s.startsWith("IMG:")){ | |
754 imgID=Long.parseLong(sub); | |
755 }else if(s.startsWith("SPID:")){ | |
756 spid=Integer.parseInt(sub); | |
757 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){ | |
758 name=sub; | |
759 }else if(s.startsWith("FN:")){ | |
760 fname=sub; | |
761 }else if(s.startsWith("NM0:")){ | |
762 name0=sub; | |
763 }else if(s.startsWith("MT_")){ | |
764 if(meta==null){meta=new ArrayList<String>(1);} | |
765 meta.add(s.substring(3)); | |
766 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){ | |
767 r16SLen=Integer.parseInt(sub); | |
768 }else if(s.startsWith("18S:")){ | |
769 r18SLen=Integer.parseInt(sub); | |
770 }else{ | |
771 assert(false) : "Unsupported header tag "+s; | |
772 } | |
773 } | |
774 } | |
775 | |
776 if(KILL_OK){ | |
777 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+ | |
778 " differs from loaded kmer length "+k+"\n"+bb+"\nfile: "+ff.name());} | |
779 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+ | |
780 " differs from loaded kmer length "+k+","+k2+"\n"+bb+"\nfile: "+ff.name());} | |
781 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+ | |
782 " differs from loaded hash version "+HASH_VERSION+".\n" | |
783 + "You may need to download the latest version of BBTools.\nfile: "+ff.name());} | |
784 }else{//Potential hang | |
785 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+bb; | |
786 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+bb; | |
787 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+ | |
788 " differs from loaded hash version "+HASH_VERSION+".\n" | |
789 + "You may need to download the latest version of BBTools.\nfile: "+ff.name(); | |
790 } | |
791 if(currentSketchSize>0 || allowZeroSizeSketch){ | |
792 list=new LongList(Tools.max(1, currentSketchSize)); | |
793 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));} | |
794 } | |
795 } | |
796 }else{ | |
797 bb.clear(); | |
798 try { | |
799 while(buffer[start]!='\n'){ | |
800 bb.append(buffer[start]); | |
801 start++; | |
802 if(start>=limit){start=0; limit=is.read(buffer);} | |
803 } | |
804 } catch (IOException e) { | |
805 KillSwitch.exceptionKill(e); | |
806 } | |
807 start++; | |
808 | |
809 long x=(counts ? Sketch.parseA48C(bb, countList) : A48 ? Sketch.parseA48(bb) : | |
810 HEX ? Sketch.parseHex(bb) : NUC ? Sketch.parseNuc(bb) : Parse.parseLong(bb.array, 0, bb.length)); | |
811 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x)); | |
812 sum+=x; | |
813 assert(x>=0 || NUC) : "x="+x+"\nline="+bb+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\n"; | |
814 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it."; | |
815 assert(list!=null) : bb; | |
816 long key=(delta ? sum : x); | |
817 if(key>=0){list.add(key);} | |
818 } | |
819 | |
820 if(start>=limit){ | |
821 start=0; | |
822 try { | |
823 limit=is.read(buffer); | |
824 } catch (IOException e) { | |
825 KillSwitch.exceptionKill(e); | |
826 } | |
827 } | |
828 } | |
829 if(list!=null && (list.size>0 || allowZeroSizeSketch)){ | |
830 | |
831 //This assertion fails sometimes for Silva per-sequence mode, but it's not important | |
832 // assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : | |
833 // list.size+"!="+list.array.length+(lastHeader==null ? "" : "\n"+new String(lastHeader)); | |
834 | |
835 if(NUC || unsorted){ | |
836 list.sort(); | |
837 list.shrinkToUnique(); | |
838 }else{ | |
839 list.shrink(); | |
840 } | |
841 int[] keyCounts=countList==null ? null : countList.array; | |
842 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, | |
843 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); | |
844 sketch.spid=spid; | |
845 sketches.add(sketch); | |
846 } | |
847 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");} | |
848 try { | |
849 is.close(); | |
850 } catch (IOException e) { | |
851 e.printStackTrace(); | |
852 } | |
853 return sketches; | |
854 } | |
855 | |
856 /*--------------------------------------------------------------*/ | |
857 | |
858 public ArrayList<Sketch> loadSketchesFromString(String sketchString){ | |
859 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false; | |
860 | |
861 ArrayList<Sketch> sketches=new ArrayList<Sketch>(); | |
862 int currentSketchSize=stTargetSketchSize; | |
863 int taxID=-1; | |
864 long spid=-1; | |
865 long imgID=-1; | |
866 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0; | |
867 long[] baseCounts=null; | |
868 byte[] r16S=null; | |
869 byte[] r18S=null; | |
870 int r16SLen=0; | |
871 int r18SLen=0; | |
872 float probCorrect=-1; | |
873 String name=null, name0=null, fname=null; | |
874 LongList list=null; | |
875 IntList countList=null; | |
876 ArrayList<String> meta=null; | |
877 long sum=0; | |
878 | |
879 String[] split0=sketchString.split("\n"); | |
880 for(String line : split0){ | |
881 if(line.length()>0){ | |
882 // System.err.println("Processing line "+new String(line)); | |
883 if(line.charAt(0)=='#'){ | |
884 if(line.length()>1 && line.charAt(1)=='#'){ | |
885 //ignore | |
886 }else if(r16SLen>0 || r18SLen>0){ | |
887 byte[] ssu=KillSwitch.copyOfRange(line.getBytes(), 5, line.length()); | |
888 if(line.startsWith("#16S:") || line.startsWith("#SSU:")){ | |
889 assert(r16SLen>0); | |
890 assert(ssu.length==r16SLen) : r16SLen+", "+line.length()+"\n"+line+"\n"; | |
891 r16S=ssu; | |
892 r16SLen=0; | |
893 }else if(line.startsWith("#18S:")){ | |
894 assert(r18SLen>0); | |
895 assert(ssu.length==r18SLen) : r18SLen+", "+line.length()+"\n"+line+"\n"; | |
896 r18S=ssu; | |
897 r18SLen=0; | |
898 }else{ | |
899 assert(false) : line; | |
900 } | |
901 }else{ | |
902 if(list!=null){ | |
903 assert(list.size==list.array.length); | |
904 if(NUC || unsorted){ | |
905 list.sort(); | |
906 list.shrinkToUnique(); | |
907 }else{ | |
908 list.shrink(); | |
909 } | |
910 if(list.size()>0){ | |
911 int[] keyCounts=countList==null ? null : countList.array; | |
912 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, | |
913 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); | |
914 sketch.spid=spid; | |
915 sketches.add(sketch); | |
916 } | |
917 // System.err.println("Made sketch "+sketch); | |
918 } | |
919 name=name0=null; | |
920 fname=null; | |
921 list=null; | |
922 countList=null; | |
923 meta=null; | |
924 baseCounts=null; | |
925 r16S=null; | |
926 r18S=null; | |
927 r16SLen=0; | |
928 r18SLen=0; | |
929 sum=0; | |
930 taxID=-1; | |
931 imgID=-1; | |
932 genomeSizeBases=0; | |
933 genomeSizeKmers=0; | |
934 genomeSequences=0; | |
935 probCorrect=-1; | |
936 int k_sketch=defaultK; | |
937 int k2_sketch=0; | |
938 int hashVersion_sketch=1; | |
939 | |
940 if(line.length()>1){ | |
941 String[] split=line.substring(1).split("\t"); | |
942 for(String s : split){ | |
943 final int colon=s.indexOf(':'); | |
944 final String sub=s.substring(colon+1); | |
945 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length | |
946 currentSketchSize=Integer.parseInt(sub); | |
947 }else if(s.startsWith("CD:")){//Coding | |
948 A48=HEX=NUC=delta=counts=false; | |
949 | |
950 for(int i=0; i<sub.length(); i++){ | |
951 char c=sub.charAt(i); | |
952 if(c=='A'){A48=true;} | |
953 else if(c=='H'){HEX=true;} | |
954 else if(c=='R'){A48=HEX=false;} | |
955 else if(c=='N'){NUC=true;} | |
956 else if(c=='D'){delta=true;} | |
957 else if(c=='C'){counts=true;} | |
958 else if(c=='U'){unsorted=true;} | |
959 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);} | |
960 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);} | |
961 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);} | |
962 } | |
963 | |
964 }else if(s.startsWith("K:")){//Kmer length | |
965 if(sub.indexOf(',')>=0){ | |
966 String[] subsplit=sub.split(","); | |
967 assert(subsplit.length==2) : "Bad header component "+s; | |
968 int x=Integer.parseInt(subsplit[0]); | |
969 int y=Integer.parseInt(subsplit[1]); | |
970 k_sketch=Tools.max(x, y); | |
971 k2_sketch=Tools.min(x, y); | |
972 }else{ | |
973 k_sketch=Integer.parseInt(s); | |
974 k2_sketch=0; | |
975 } | |
976 }else if(s.startsWith("H:")){//Hash version | |
977 hashVersion_sketch=Integer.parseInt(sub); | |
978 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN | |
979 baseCounts=Parse.parseLongArray(sub); | |
980 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases | |
981 genomeSizeBases=Long.parseLong(sub); | |
982 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers | |
983 genomeSizeKmers=Long.parseLong(sub); | |
984 }else if(s.startsWith("GQ:")){ | |
985 genomeSequences=Long.parseLong(sub); | |
986 }else if(s.startsWith("GE:")){//Genome size estimate kmers | |
987 //ignore | |
988 }else if(s.startsWith("PC:")){//Probability of correctness | |
989 probCorrect=Float.parseFloat(sub); | |
990 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){ | |
991 taxID=Integer.parseInt(sub); | |
992 }else if(s.startsWith("IMG:")){ | |
993 imgID=Long.parseLong(sub); | |
994 }else if(s.startsWith("SPID:")){ | |
995 spid=Integer.parseInt(sub); | |
996 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){ | |
997 name=sub; | |
998 }else if(s.startsWith("FN:")){ | |
999 fname=sub; | |
1000 }else if(s.startsWith("NM0:")){ | |
1001 name0=sub; | |
1002 }else if(s.startsWith("MT_")){ | |
1003 if(meta==null){meta=new ArrayList<String>(1);} | |
1004 meta.add(s.substring(3)); | |
1005 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){ | |
1006 r16SLen=Integer.parseInt(sub); | |
1007 }else if(s.startsWith("18S:")){ | |
1008 r18SLen=Integer.parseInt(sub); | |
1009 }else{ | |
1010 assert(false) : "Unsupported header tag "+s; | |
1011 } | |
1012 } | |
1013 } | |
1014 if(KILL_OK){ | |
1015 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line));} | |
1016 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line));} | |
1017 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+ | |
1018 " differs from loaded hash version "+HASH_VERSION+".\n" | |
1019 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n");} | |
1020 }else{//Potential hang | |
1021 assert(k_sketch==k && !NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line); | |
1022 assert(k2_sketch==k2 && !NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line); | |
1023 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+ | |
1024 " differs from loaded hash version "+HASH_VERSION+".\n" | |
1025 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n"; | |
1026 } | |
1027 | |
1028 | |
1029 if(currentSketchSize>0){ | |
1030 list=new LongList(currentSketchSize); | |
1031 if(counts){countList=new IntList(currentSketchSize);} | |
1032 } | |
1033 } | |
1034 }else{ | |
1035 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) : | |
1036 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Long.parseLong(line)); | |
1037 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x)); | |
1038 sum+=x; | |
1039 assert(x>=0 || NUC) : x+"\n"+new String(line); | |
1040 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it."; | |
1041 long key=(delta ? sum : x); | |
1042 if(key>=0){list.add(key);} | |
1043 } | |
1044 } | |
1045 } | |
1046 | |
1047 if(list!=null){ | |
1048 assert(list.size==list.array.length || list.size()==0 || NUC || unsorted); | |
1049 if(NUC || unsorted){ | |
1050 list.sort(); | |
1051 list.shrinkToUnique(); | |
1052 }else{ | |
1053 list.shrink(); | |
1054 } | |
1055 int[] keyCounts=countList==null ? null : countList.array; | |
1056 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID, | |
1057 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta); | |
1058 sketch.spid=spid; | |
1059 sketches.add(sketch); | |
1060 } | |
1061 return sketches; | |
1062 } | |
1063 | |
1064 /*--------------------------------------------------------------*/ | |
1065 /*---------------- Threads ----------------*/ | |
1066 /*--------------------------------------------------------------*/ | |
1067 | |
1068 | |
1069 | |
1070 private class LoadThread extends Thread{ | |
1071 | |
1072 public LoadThread(ConcurrentLinkedQueue<StringNum> queue_, int mode_, float samplerate_, long reads_, float minEntropy, float minProb, byte minQual) { | |
1073 queue=queue_; | |
1074 list=new ArrayList<Sketch>(); | |
1075 smm=new SketchMakerMini(SketchTool.this, mode_, minEntropy, minProb, minQual); | |
1076 samplerate=samplerate_; | |
1077 reads=reads_; | |
1078 } | |
1079 | |
1080 @Override | |
1081 public void run(){ | |
1082 success=false; | |
1083 for(StringNum sn=queue.poll(); sn!=null; sn=queue.poll()){ | |
1084 ArrayList<Sketch> temp=null; | |
1085 try { | |
1086 temp=loadSketchesFromFile(sn.s, smm, 1, reads, smm.mode, samplerate, smm.minEntropy(), smm.minProb(), smm.minQual(), false); | |
1087 } catch (Throwable e) { | |
1088 System.err.println("Failure loading "+sn+":\n"+e); | |
1089 e.printStackTrace(); | |
1090 success=false; | |
1091 } | |
1092 if(temp!=null && temp.size()>0){ | |
1093 if(smm.mode==PER_FILE){ | |
1094 // assert(temp.size()==1) : temp.size(); | |
1095 temp.get(0).sketchID=(int)sn.n; | |
1096 } | |
1097 for(Sketch s : temp){add(s);} | |
1098 } | |
1099 } | |
1100 success=true; | |
1101 } | |
1102 | |
1103 private void add(Sketch s){ | |
1104 if(list!=null){ | |
1105 list.add(s); | |
1106 return; | |
1107 } | |
1108 assert(false) : "Unsupported."; //The map logic is broken; needs to be synchronized. | |
1109 // if(s.taxID<0){return;} | |
1110 //// assert(s.taxID>-1) : s.toHeader(); | |
1111 // TaxNode tn=tree.getNode(s.taxID); | |
1112 // while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){ | |
1113 // TaxNode temp=tree.getNode(tn.pid); | |
1114 // if(temp==null){break;} | |
1115 // tn=temp; | |
1116 // } | |
1117 // if(tn==null){return;} | |
1118 // Integer key=tn.id; | |
1119 // Sketch old=map.get(key); | |
1120 // if(old==null){ | |
1121 // s.taxID=key; | |
1122 // map.put(key, s); | |
1123 // }else{ | |
1124 // synchronized(old){ | |
1125 // old.add(s, maxLen); | |
1126 // } | |
1127 // } | |
1128 } | |
1129 | |
1130 final ConcurrentLinkedQueue<StringNum> queue; | |
1131 ArrayList<Sketch> list; | |
1132 boolean success=false; | |
1133 final SketchMakerMini smm; | |
1134 final float samplerate; | |
1135 final long reads; | |
1136 | |
1137 // ConcurrentHashMap<Integer, Sketch> map; | |
1138 | |
1139 } | |
1140 | |
1141 private class LoadThread2 extends Thread{ | |
1142 | |
1143 LoadThread2(ConcurrentReadInputStream cris_, float minEntropy, float minProb, byte minQual){ | |
1144 cris=cris_; | |
1145 smm=new SketchMakerMini(SketchTool.this, ONE_SKETCH, minEntropy, minProb, minQual); | |
1146 } | |
1147 | |
1148 @Override | |
1149 public void run(){ | |
1150 | |
1151 //Grab the first ListNum of reads | |
1152 ListNum<Read> ln=cris.nextList(); | |
1153 //Grab the actual read list from the ListNum | |
1154 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
1155 | |
1156 //As long as there is a nonempty read list... | |
1157 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
1158 | |
1159 //Loop through each read in the list | |
1160 for(int idx=0; idx<reads.size(); idx++){ | |
1161 final Read r1=reads.get(idx); | |
1162 final Read r2=r1.mate; | |
1163 | |
1164 if(validate){ | |
1165 if(r1!=null){r1.validate(true);} | |
1166 if(r2!=null){r2.validate(true);} | |
1167 } | |
1168 | |
1169 smm.processReadPair(r1, r2); | |
1170 } | |
1171 | |
1172 //Notify the input stream that the list was used | |
1173 cris.returnList(ln); | |
1174 | |
1175 //Fetch a new list | |
1176 ln=cris.nextList(); | |
1177 reads=(ln!=null ? ln.list : null); | |
1178 } | |
1179 | |
1180 //Notify the input stream that the final list was used | |
1181 if(ln!=null){ | |
1182 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); | |
1183 } | |
1184 } | |
1185 | |
1186 private final boolean validate=!Read.VALIDATE_IN_CONSTRUCTOR; | |
1187 ConcurrentReadInputStream cris; | |
1188 SketchMakerMini smm; | |
1189 | |
1190 } | |
1191 | |
1192 /** Converts KmerTableSets to Heaps */ | |
1193 private class SketchThread extends Thread { | |
1194 | |
1195 SketchThread(AtomicInteger next_, KmerTableSet kts_){ | |
1196 next=next_; | |
1197 kts=kts_; | |
1198 } | |
1199 | |
1200 @Override | |
1201 public void run(){ | |
1202 final int ways=kts.ways(); | |
1203 int tnum=next.getAndIncrement(); | |
1204 while(tnum<ways){ | |
1205 HashArray1D table=kts.getTable(tnum); | |
1206 if(stTargetSketchSize>0){ | |
1207 if(heap==null){heap=new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts);} | |
1208 toHeap(table, heap); | |
1209 }else{ | |
1210 if(list==null){list=new LongList();} | |
1211 toList(table, list); | |
1212 } | |
1213 tnum=next.getAndIncrement(); | |
1214 } | |
1215 } | |
1216 | |
1217 final AtomicInteger next; | |
1218 final KmerTableSet kts; | |
1219 SketchHeap heap; | |
1220 LongList list; | |
1221 } | |
1222 | |
1223 /*--------------------------------------------------------------*/ | |
1224 /*---------------- Read Loading ----------------*/ | |
1225 /*--------------------------------------------------------------*/ | |
1226 | |
1227 public Sketch processReadsMT(String fname, int maxThreads, long reads, int mode, float samplerate, | |
1228 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ | |
1229 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){ | |
1230 return processReadsMT(fname.replaceFirst("#", "1"), fname.replaceFirst("#", "2"), maxThreads, reads, mode, samplerate, | |
1231 minEntropy, minProb, minQual, allowZeroSizeSketch); | |
1232 }else{ | |
1233 return processReadsMT(fname, null, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); | |
1234 } | |
1235 } | |
1236 | |
1237 public Sketch processReadsMT(String fname1, String fname2, int maxThreads, long reads, int mode, float samplerate, | |
1238 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ | |
1239 final FileFormat ffin1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true); | |
1240 final FileFormat ffin2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true); | |
1241 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch); | |
1242 } | |
1243 | |
1244 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){ | |
1245 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch); | |
1246 } | |
1247 | |
1248 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, float samplerate, | |
1249 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){ | |
1250 assert(mode==ONE_SKETCH || mode==PER_FILE); | |
1251 final boolean compressed=ffin1.compressed(); | |
1252 | |
1253 maxThreads=Tools.mid(1, maxThreads, Shared.threads()); | |
1254 | |
1255 //Create a read input stream | |
1256 final ConcurrentReadInputStream cris; | |
1257 String simpleName; | |
1258 { | |
1259 simpleName=ffin1.simpleName(); | |
1260 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null); | |
1261 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);} | |
1262 cris.start(); //Start the stream | |
1263 // if(verbose){outstream.println("Started cris");} | |
1264 } | |
1265 | |
1266 //TODO: bgzip actually decompresses fast. | |
1267 final int threads=(int)Tools.min(maxThreads, | |
1268 1.75f*(compressed ? 1 : 2)*(ffin2==null ? 4 : 8)*(mergePairs ? 3 : minEntropy>0 ? 2 : 1)); | |
1269 | |
1270 if(verbose2){System.err.println("Starting "+threads+" load threads.");} | |
1271 ArrayList<LoadThread2> list=new ArrayList<LoadThread2>(threads); | |
1272 for(int i=0; i<threads; i++){ | |
1273 list.add(new LoadThread2(cris, minEntropy, minProb, minQual)); | |
1274 list.get(i).start(); | |
1275 } | |
1276 | |
1277 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads); | |
1278 | |
1279 for(LoadThread2 pt : list){ | |
1280 | |
1281 //Wait until this thread has terminated | |
1282 while(pt.getState()!=Thread.State.TERMINATED){ | |
1283 try { | |
1284 //Attempt a join operation | |
1285 pt.join(); | |
1286 } catch (InterruptedException e) { | |
1287 //Potentially handle this, if it is expected to occur | |
1288 e.printStackTrace(); | |
1289 } | |
1290 } | |
1291 | |
1292 if(pt.smm.heap!=null && pt.smm.heap.size()>0){ | |
1293 heaps.add(pt.smm.heap); | |
1294 } | |
1295 } | |
1296 list.clear(); | |
1297 ReadWrite.closeStream(cris); | |
1298 | |
1299 if(verbose2){System.err.println("Generating a sketch by combining thread output.");} | |
1300 Sketch sketch=toSketch(heaps, allowZeroSizeSketch); | |
1301 if(verbose2){System.err.println("Resulting sketch: "+((sketch==null) ? "null" : "length="+sketch.length()));} | |
1302 if(sketch!=null){sketch.setFname(simpleName);} | |
1303 return sketch; | |
1304 } | |
1305 | |
1306 /*--------------------------------------------------------------*/ | |
1307 /*---------------- Writing ----------------*/ | |
1308 /*--------------------------------------------------------------*/ | |
1309 | |
1310 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff[]){ | |
1311 final int len=ff.length; | |
1312 ByteStreamWriter tsw[]=new ByteStreamWriter[len]; | |
1313 for(int i=0; i<len; i++){ | |
1314 tsw[i]=new ByteStreamWriter(ff[i]); | |
1315 tsw[i].start(); | |
1316 } | |
1317 boolean error=false; | |
1318 for(int i=0; i<sketches.size(); i++){ | |
1319 write(sketches.get(i), tsw[i%len], new ByteBuilder()); | |
1320 } | |
1321 for(int i=0; i<len; i++){ | |
1322 error|=tsw[i].poisonAndWait(); | |
1323 } | |
1324 return error; | |
1325 } | |
1326 | |
1327 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff){ | |
1328 final ByteStreamWriter tsw=new ByteStreamWriter(ff); | |
1329 final ByteBuilder bb=new ByteBuilder(); | |
1330 tsw.start(); | |
1331 for(Sketch sketch : sketches){ | |
1332 write(sketch, tsw, bb); | |
1333 } | |
1334 return tsw.poisonAndWait(); | |
1335 } | |
1336 | |
1337 public static boolean write(Sketch sketch, FileFormat ff){ | |
1338 // System.err.println(ff.name()+", "+new File(ff.name()).exists()); | |
1339 ByteStreamWriter tsw=new ByteStreamWriter(ff); | |
1340 // assert(false) : new File(ff.name()).exists(); | |
1341 tsw.start(); | |
1342 write(sketch, tsw, null); | |
1343 return tsw.poisonAndWait(); | |
1344 } | |
1345 | |
1346 public static void write(Sketch sketch, ByteStreamWriter tsw, ByteBuilder bb){ | |
1347 if(bb==null){bb=new ByteBuilder();} | |
1348 else{bb.clear();} | |
1349 sketch.toBytes(bb); | |
1350 tsw.print(bb); | |
1351 } | |
1352 | |
1353 /*--------------------------------------------------------------*/ | |
1354 /*---------------- Fields ----------------*/ | |
1355 /*--------------------------------------------------------------*/ | |
1356 | |
1357 // final EntropyTracker eTracker; | |
1358 final int stTargetSketchSize; | |
1359 public final int minKeyOccuranceCount; | |
1360 /** Force kmer counts to be tracked. */ | |
1361 public final boolean trackCounts; | |
1362 /** Merge reads before processing kmers. */ | |
1363 public final boolean mergePairs; | |
1364 | |
1365 public static int BUFLEN=16384; | |
1366 public static boolean BUFFERED_READER=false; | |
1367 | |
1368 /*--------------------------------------------------------------*/ | |
1369 /*---------------- Static Fields ----------------*/ | |
1370 /*--------------------------------------------------------------*/ | |
1371 | |
1372 // public static boolean verbose=false; | |
1373 | |
1374 } |