jpayne@68
|
1 package sketch;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.File;
|
jpayne@68
|
4 import java.io.IOException;
|
jpayne@68
|
5 import java.io.InputStream;
|
jpayne@68
|
6 import java.util.ArrayList;
|
jpayne@68
|
7 import java.util.Collection;
|
jpayne@68
|
8 import java.util.concurrent.ConcurrentLinkedQueue;
|
jpayne@68
|
9 import java.util.concurrent.atomic.AtomicInteger;
|
jpayne@68
|
10
|
jpayne@68
|
11 import dna.AminoAcid;
|
jpayne@68
|
12 import fileIO.ByteFile;
|
jpayne@68
|
13 import fileIO.ByteStreamWriter;
|
jpayne@68
|
14 import fileIO.FileFormat;
|
jpayne@68
|
15 import fileIO.ReadWrite;
|
jpayne@68
|
16 import kmer.HashArray1D;
|
jpayne@68
|
17 import kmer.HashForest;
|
jpayne@68
|
18 import kmer.KmerNode;
|
jpayne@68
|
19 import kmer.KmerTableSet;
|
jpayne@68
|
20 import shared.KillSwitch;
|
jpayne@68
|
21 import shared.Parse;
|
jpayne@68
|
22 import shared.Shared;
|
jpayne@68
|
23 import shared.Tools;
|
jpayne@68
|
24 import stream.ConcurrentReadInputStream;
|
jpayne@68
|
25 import stream.Read;
|
jpayne@68
|
26 import structures.ByteBuilder;
|
jpayne@68
|
27 import structures.IntList;
|
jpayne@68
|
28 import structures.ListNum;
|
jpayne@68
|
29 import structures.LongList;
|
jpayne@68
|
30 import structures.StringNum;
|
jpayne@68
|
31
|
jpayne@68
|
32 /**
|
jpayne@68
|
33 * @author Brian Bushnell
|
jpayne@68
|
34 * @date June 28, 2016
|
jpayne@68
|
35 *
|
jpayne@68
|
36 */
|
jpayne@68
|
37 public final class SketchTool extends SketchObject {
|
jpayne@68
|
38
|
jpayne@68
|
39 /*--------------------------------------------------------------*/
|
jpayne@68
|
40 /*---------------- Constructor ----------------*/
|
jpayne@68
|
41 /*--------------------------------------------------------------*/
|
jpayne@68
|
42
|
jpayne@68
|
43 public SketchTool(int size_, DisplayParams params){
|
jpayne@68
|
44 this(size_, params.minKeyOccuranceCount, params.trackCounts(), params.mergePairs);
|
jpayne@68
|
45 }
|
jpayne@68
|
46
|
jpayne@68
|
47 public SketchTool(int size_, int minKeyOccuranceCount_, boolean trackCounts_, boolean mergePairs_){
|
jpayne@68
|
48 stTargetSketchSize=size_;
|
jpayne@68
|
49 minKeyOccuranceCount=minKeyOccuranceCount_;
|
jpayne@68
|
50 trackCounts=trackCounts_;
|
jpayne@68
|
51 mergePairs=mergePairs_;
|
jpayne@68
|
52
|
jpayne@68
|
53 assert(!aminoOrTranslate() || !rcomp) : "rcomp should be false in amino mode.";
|
jpayne@68
|
54 assert(!aminoOrTranslate() || (k*AminoAcid.AMINO_SHIFT<64)) : "Protein sketches require 1 <= K <= "+(63/AminoAcid.AMINO_SHIFT)+".";
|
jpayne@68
|
55 assert(k>0 && k<=32) : "Sketches require 1 <= K <= 32."; //123
|
jpayne@68
|
56 }
|
jpayne@68
|
57
|
jpayne@68
|
58 /*--------------------------------------------------------------*/
|
jpayne@68
|
59 /*---------------- Methods ----------------*/
|
jpayne@68
|
60 /*--------------------------------------------------------------*/
|
jpayne@68
|
61
|
jpayne@68
|
62 public Sketch toSketch(KmerTableSet tables, boolean multithreaded){
|
jpayne@68
|
63 final int threads=(multithreaded ? Tools.mid(1, Shared.threads(), tables.ways()) : 1);
|
jpayne@68
|
64 return (threads<2 ? toSketch_ST(tables) : toSketch_MT(tables, threads));
|
jpayne@68
|
65 }
|
jpayne@68
|
66
|
jpayne@68
|
67 private Sketch toSketch_ST(KmerTableSet tables){
|
jpayne@68
|
68 SketchHeap heap=(stTargetSketchSize>0 ? new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts) : null);
|
jpayne@68
|
69 LongList list=new LongList();
|
jpayne@68
|
70
|
jpayne@68
|
71 KmerTableSet kts=(KmerTableSet)tables;
|
jpayne@68
|
72 for(int tnum=0; tnum<kts.ways; tnum++){
|
jpayne@68
|
73 HashArray1D table=kts.getTable(tnum);
|
jpayne@68
|
74 if(stTargetSketchSize>0){
|
jpayne@68
|
75 toHeap(table, heap);
|
jpayne@68
|
76 }else{
|
jpayne@68
|
77 toList(table, list);
|
jpayne@68
|
78 }
|
jpayne@68
|
79 }
|
jpayne@68
|
80 return stTargetSketchSize>0 ? new Sketch(heap, false, trackCounts, null) : toSketch(list);//TODO: Could add counts here
|
jpayne@68
|
81 }
|
jpayne@68
|
82
|
jpayne@68
|
83 private Sketch toSketch_MT(KmerTableSet tables, final int threads){
|
jpayne@68
|
84 ArrayList<SketchThread> alst=new ArrayList<SketchThread>(threads);
|
jpayne@68
|
85 AtomicInteger ai=new AtomicInteger(0);
|
jpayne@68
|
86 for(int i=0; i<threads; i++){
|
jpayne@68
|
87 alst.add(new SketchThread(ai, tables));
|
jpayne@68
|
88 }
|
jpayne@68
|
89
|
jpayne@68
|
90 //Start the threads
|
jpayne@68
|
91 for(SketchThread pt : alst){
|
jpayne@68
|
92 pt.start();
|
jpayne@68
|
93 }
|
jpayne@68
|
94
|
jpayne@68
|
95 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads);
|
jpayne@68
|
96 LongList list=new LongList();
|
jpayne@68
|
97
|
jpayne@68
|
98 for(SketchThread pt : alst){
|
jpayne@68
|
99
|
jpayne@68
|
100 //Wait until this thread has terminated
|
jpayne@68
|
101 while(pt.getState()!=Thread.State.TERMINATED){
|
jpayne@68
|
102 try {
|
jpayne@68
|
103 //Attempt a join operation
|
jpayne@68
|
104 pt.join();
|
jpayne@68
|
105 } catch (InterruptedException e) {
|
jpayne@68
|
106 //Potentially handle this, if it is expected to occur
|
jpayne@68
|
107 e.printStackTrace();
|
jpayne@68
|
108 }
|
jpayne@68
|
109 }
|
jpayne@68
|
110
|
jpayne@68
|
111 if(stTargetSketchSize>=0){
|
jpayne@68
|
112 if(pt.heap!=null && pt.heap.size()>0){
|
jpayne@68
|
113 heaps.add(pt.heap);
|
jpayne@68
|
114 }
|
jpayne@68
|
115 }else{
|
jpayne@68
|
116 if(pt.list!=null){list.append(pt.list);}
|
jpayne@68
|
117 pt.list=null;
|
jpayne@68
|
118 }
|
jpayne@68
|
119 }
|
jpayne@68
|
120 alst.clear();
|
jpayne@68
|
121
|
jpayne@68
|
122 return stTargetSketchSize>=0 ? toSketch(heaps, true) : toSketch(list);//TODO: Could add counts here
|
jpayne@68
|
123 }
|
jpayne@68
|
124
|
jpayne@68
|
125 public SketchHeap toHeap(HashArray1D table, SketchHeap heap){
|
jpayne@68
|
126 // if(heap==null){heap=new LongHeap(size, true);}
|
jpayne@68
|
127 long[] kmers=table.array();
|
jpayne@68
|
128 int[] counts=table.values();
|
jpayne@68
|
129 for(int i=0; i<table.arrayLength(); i++){
|
jpayne@68
|
130 int count=counts[i];
|
jpayne@68
|
131 if(count>=minKeyOccuranceCount){
|
jpayne@68
|
132 heap.genomeSizeKmers++;
|
jpayne@68
|
133 long kmer=kmers[i];
|
jpayne@68
|
134 long hashcode=hash(kmer);
|
jpayne@68
|
135 if(hashcode>=minHashValue){
|
jpayne@68
|
136 heap.add(hashcode);
|
jpayne@68
|
137 }
|
jpayne@68
|
138 }
|
jpayne@68
|
139 }
|
jpayne@68
|
140 HashForest forest=table.victims();
|
jpayne@68
|
141 if(forest!=null){
|
jpayne@68
|
142 for(KmerNode kn : forest.array()){
|
jpayne@68
|
143 if(kn!=null){addRecursive(heap, kn);}
|
jpayne@68
|
144 }
|
jpayne@68
|
145 }
|
jpayne@68
|
146 return heap;
|
jpayne@68
|
147 }
|
jpayne@68
|
148
|
jpayne@68
|
149 public LongList toList(HashArray1D table, LongList list){
|
jpayne@68
|
150 // if(heap==null){heap=new LongHeap(size, true);}
|
jpayne@68
|
151 long[] kmers=table.array();
|
jpayne@68
|
152 int[] counts=table.values();
|
jpayne@68
|
153 for(int i=0; i<table.arrayLength(); i++){
|
jpayne@68
|
154 int count=counts[i];
|
jpayne@68
|
155 if(count>=minKeyOccuranceCount){
|
jpayne@68
|
156 long kmer=kmers[i];
|
jpayne@68
|
157 long hashcode=hash(kmer);
|
jpayne@68
|
158 if(hashcode>=minHashValue){
|
jpayne@68
|
159 list.add(hashcode);
|
jpayne@68
|
160 }
|
jpayne@68
|
161 }
|
jpayne@68
|
162 }
|
jpayne@68
|
163 HashForest forest=table.victims();
|
jpayne@68
|
164 if(forest!=null){
|
jpayne@68
|
165 for(KmerNode kn : forest.array()){
|
jpayne@68
|
166 if(kn!=null){addRecursive(list, kn);}
|
jpayne@68
|
167 }
|
jpayne@68
|
168 }
|
jpayne@68
|
169 return list;
|
jpayne@68
|
170 }
|
jpayne@68
|
171
|
jpayne@68
|
172 // public long[] toSketchArray(ArrayList<LongHeap> heaps){
|
jpayne@68
|
173 // if(heaps.size()==1){return toSketchArray(heaps.get(0));}
|
jpayne@68
|
174 // LongList list=new LongList(size);
|
jpayne@68
|
175 // for(LongHeap heap : heaps){
|
jpayne@68
|
176 // while(heap.size()>0){list.add(Long.MAX_VALUE-heap.poll());}
|
jpayne@68
|
177 // }
|
jpayne@68
|
178 // list.sort();
|
jpayne@68
|
179 // list.shrinkToUnique();
|
jpayne@68
|
180 // list.size=Tools.min(size, list.size);
|
jpayne@68
|
181 // return list.toArray();
|
jpayne@68
|
182 // }
|
jpayne@68
|
183
|
jpayne@68
|
184 public Sketch toSketch(ArrayList<SketchHeap> heaps, boolean allowZeroSizeSketch){
|
jpayne@68
|
185 if(heaps==null || heaps.isEmpty()){
|
jpayne@68
|
186 if(allowZeroSizeSketch){
|
jpayne@68
|
187 return new Sketch(new long[0], null, null, null, null, null);
|
jpayne@68
|
188 }else{
|
jpayne@68
|
189 return null;
|
jpayne@68
|
190 }
|
jpayne@68
|
191 }
|
jpayne@68
|
192 SketchHeap a=heaps.get(0);
|
jpayne@68
|
193 for(int i=1; i<heaps.size(); i++){
|
jpayne@68
|
194 SketchHeap b=heaps.get(i);
|
jpayne@68
|
195 a.add(b);
|
jpayne@68
|
196 }
|
jpayne@68
|
197 if(verbose2){System.err.println("Creating a sketch of size "+a.size()+".");}
|
jpayne@68
|
198 return new Sketch(a, false, trackCounts, null);
|
jpayne@68
|
199 }
|
jpayne@68
|
200
|
jpayne@68
|
201 Sketch toSketch(LongList list){
|
jpayne@68
|
202 list.sort();
|
jpayne@68
|
203 assert(list.size==0 || list.get(list.size()-1)>=minHashValue) : list.size+", "+list.get(list.size()-1)+", "+minHashValue;
|
jpayne@68
|
204 list.shrinkToUnique();
|
jpayne@68
|
205 list.reverse();
|
jpayne@68
|
206 for(int i=0; i<list.size; i++){list.array[i]=Long.MAX_VALUE-list.array[i];}
|
jpayne@68
|
207 return new Sketch(list.toArray(), null, null, null, null, null);
|
jpayne@68
|
208 }
|
jpayne@68
|
209
|
jpayne@68
|
210 /*--------------------------------------------------------------*/
|
jpayne@68
|
211 /*---------------- Helpers ----------------*/
|
jpayne@68
|
212 /*--------------------------------------------------------------*/
|
jpayne@68
|
213
|
jpayne@68
|
214 private void addRecursive(SketchHeap heap, KmerNode kn){
|
jpayne@68
|
215 if(kn==null){return;}
|
jpayne@68
|
216 if(kn.count()>=minKeyOccuranceCount){
|
jpayne@68
|
217 heap.genomeSizeKmers++;
|
jpayne@68
|
218 long kmer=kn.pivot();
|
jpayne@68
|
219 long hashcode=hash(kmer);
|
jpayne@68
|
220 if(hashcode>=minHashValue){heap.add(hashcode);}
|
jpayne@68
|
221 }
|
jpayne@68
|
222 if(kn.left()!=null){addRecursive(heap, kn.left());}
|
jpayne@68
|
223 if(kn.right()!=null){addRecursive(heap, kn.right());}
|
jpayne@68
|
224 }
|
jpayne@68
|
225
|
jpayne@68
|
226 private void addRecursive(LongList list, KmerNode kn){
|
jpayne@68
|
227 if(kn==null){return;}
|
jpayne@68
|
228 if(kn.count()>=minKeyOccuranceCount){
|
jpayne@68
|
229 long kmer=kn.pivot();
|
jpayne@68
|
230 long hashcode=hash(kmer);
|
jpayne@68
|
231 if(hashcode>=minHashValue){list.add(hashcode);}
|
jpayne@68
|
232 }
|
jpayne@68
|
233 if(kn.left()!=null){addRecursive(list, kn.left());}
|
jpayne@68
|
234 if(kn.right()!=null){addRecursive(list, kn.right());}
|
jpayne@68
|
235 }
|
jpayne@68
|
236
|
jpayne@68
|
237 /*--------------------------------------------------------------*/
|
jpayne@68
|
238 /*---------------- I/O ----------------*/
|
jpayne@68
|
239 /*--------------------------------------------------------------*/
|
jpayne@68
|
240
|
jpayne@68
|
241 // public static ArrayList<Sketch> loadSketches_ST(String...fnames){
|
jpayne@68
|
242 // ArrayList<Sketch> sketches=null;
|
jpayne@68
|
243 // for(String s : fnames){
|
jpayne@68
|
244 // ArrayList<Sketch> temp;
|
jpayne@68
|
245 // if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){
|
jpayne@68
|
246 // temp=loadSketches(s);
|
jpayne@68
|
247 // }else{
|
jpayne@68
|
248 // temp=loadSketches_ST(s.split(","));
|
jpayne@68
|
249 // }
|
jpayne@68
|
250 // if(sketches==null){sketches=temp;}
|
jpayne@68
|
251 // else{sketches.addAll(temp);}
|
jpayne@68
|
252 // }
|
jpayne@68
|
253 // return sketches;
|
jpayne@68
|
254 // }
|
jpayne@68
|
255
|
jpayne@68
|
256 // public static ArrayList<Sketch> loadSketches_MT(ArrayList<String> fnames){
|
jpayne@68
|
257 // return loadSketches_MT(0, null, fnames.toArray(new String[0]));
|
jpayne@68
|
258 // }
|
jpayne@68
|
259
|
jpayne@68
|
260 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, Collection<String> fnames){
|
jpayne@68
|
261 return loadSketches_MT(params.mode, params.samplerate, params.maxReads,
|
jpayne@68
|
262 params.minEntropy, params.minProb, params.minQual, fnames);
|
jpayne@68
|
263 }
|
jpayne@68
|
264
|
jpayne@68
|
265 public ArrayList<Sketch> loadSketches_MT(DisplayParams params, String...fnames){
|
jpayne@68
|
266 return loadSketches_MT(params.mode, params.samplerate, params.maxReads,
|
jpayne@68
|
267 params.minEntropy, params.minProb, params.minQual, fnames);
|
jpayne@68
|
268 }
|
jpayne@68
|
269
|
jpayne@68
|
270 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, Collection<String> fnames){
|
jpayne@68
|
271 return loadSketches_MT(mode, samplerate, reads, minEntropy, minProb, minQual, fnames.toArray(new String[0]));
|
jpayne@68
|
272 }
|
jpayne@68
|
273
|
jpayne@68
|
274 //TODO: This is only multithreaded per file in persequence mode.
|
jpayne@68
|
275 public ArrayList<Sketch> loadSketches_MT(int mode, float samplerate, long reads, float minEntropy, float minProb, byte minQual, String...fnames){
|
jpayne@68
|
276
|
jpayne@68
|
277 ConcurrentLinkedQueue<StringNum> decomposedFnames=new ConcurrentLinkedQueue<StringNum>();
|
jpayne@68
|
278 int num=0;
|
jpayne@68
|
279 for(String s : fnames){
|
jpayne@68
|
280 if(s.indexOf(',')<0 || s.startsWith("stdin") || new File(s).exists()){
|
jpayne@68
|
281 num++;
|
jpayne@68
|
282 decomposedFnames.add(new StringNum(s, num));
|
jpayne@68
|
283 }else{
|
jpayne@68
|
284 for(String s2 : s.split(",")){
|
jpayne@68
|
285 num++;
|
jpayne@68
|
286 decomposedFnames.add(new StringNum(s2, num));
|
jpayne@68
|
287 }
|
jpayne@68
|
288 }
|
jpayne@68
|
289 }
|
jpayne@68
|
290
|
jpayne@68
|
291 if(decomposedFnames.size()==0){return null;}
|
jpayne@68
|
292 if(decomposedFnames.size()==1){return loadSketchesFromFile(decomposedFnames.poll().s, null, 0, reads, mode, samplerate, minEntropy, minProb, minQual, false);}
|
jpayne@68
|
293
|
jpayne@68
|
294 //Determine how many threads may be used
|
jpayne@68
|
295 final int threads=Tools.min(Shared.threads(), decomposedFnames.size());
|
jpayne@68
|
296
|
jpayne@68
|
297 //Fill a list with LoadThreads
|
jpayne@68
|
298 ArrayList<LoadThread> allt=new ArrayList<LoadThread>(threads);
|
jpayne@68
|
299
|
jpayne@68
|
300 for(int i=0; i<threads; i++){
|
jpayne@68
|
301 allt.add(new LoadThread(decomposedFnames, mode, samplerate, reads, minEntropy, minProb, minQual));
|
jpayne@68
|
302 }
|
jpayne@68
|
303
|
jpayne@68
|
304 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
|
jpayne@68
|
305
|
jpayne@68
|
306 //Start the threads
|
jpayne@68
|
307 for(LoadThread lt : allt){lt.start();}
|
jpayne@68
|
308
|
jpayne@68
|
309 //Wait for completion of all threads
|
jpayne@68
|
310 boolean success=true;
|
jpayne@68
|
311 for(LoadThread lt : allt){
|
jpayne@68
|
312
|
jpayne@68
|
313 //Wait until this thread has terminated
|
jpayne@68
|
314 while(lt.getState()!=Thread.State.TERMINATED){
|
jpayne@68
|
315 try {
|
jpayne@68
|
316 //Attempt a join operation
|
jpayne@68
|
317 lt.join();
|
jpayne@68
|
318 } catch (InterruptedException e) {
|
jpayne@68
|
319 //Potentially handle this, if it is expected to occur
|
jpayne@68
|
320 e.printStackTrace();
|
jpayne@68
|
321 }
|
jpayne@68
|
322 }
|
jpayne@68
|
323 sketches.addAll(lt.list);
|
jpayne@68
|
324 success&=lt.success;
|
jpayne@68
|
325 }
|
jpayne@68
|
326 assert(success) : "Failure loading some files.";
|
jpayne@68
|
327 return sketches;
|
jpayne@68
|
328 }
|
jpayne@68
|
329
|
jpayne@68
|
330 /*--------------------------------------------------------------*/
|
jpayne@68
|
331
|
jpayne@68
|
332 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm,
|
jpayne@68
|
333 int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){
|
jpayne@68
|
334 return loadSketchesFromFile(fname0, smm, maxThreads, reads, mode,
|
jpayne@68
|
335 params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch);
|
jpayne@68
|
336 }
|
jpayne@68
|
337
|
jpayne@68
|
338 public ArrayList<Sketch> loadSketchesFromFile(final String fname0, SketchMakerMini smm,
|
jpayne@68
|
339 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
|
jpayne@68
|
340 assert(fname0!=null);//123
|
jpayne@68
|
341 if(fname0==null){return null;}
|
jpayne@68
|
342 FileFormat ff=FileFormat.testInput(fname0, FileFormat.FASTA, null, false, true);
|
jpayne@68
|
343 if(ff.isSequence()){
|
jpayne@68
|
344 return loadSketchesFromSequenceFile(ff, smm, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
|
jpayne@68
|
345 }else{
|
jpayne@68
|
346 return SketchObject.LOADER2 ? loadSketchesFromSketchFile2(ff, allowZeroSizeSketch) : loadSketchesFromSketchFile(ff, allowZeroSizeSketch);
|
jpayne@68
|
347 }
|
jpayne@68
|
348 }
|
jpayne@68
|
349
|
jpayne@68
|
350 private ArrayList<Sketch> loadSketchesFromSequenceFile(final FileFormat ff, SketchMakerMini smm,
|
jpayne@68
|
351 int maxThreads, long reads, int mode, float samplerate, float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
|
jpayne@68
|
352 maxThreads=(maxThreads<1 ? Shared.threads() : Tools.min(maxThreads, Shared.threads()));
|
jpayne@68
|
353
|
jpayne@68
|
354 // assert(false) : (ff.fasta() || ff.fastq() || ff.samOrBam())+", "+ff.fastq()+", "+maxThreads+", "+
|
jpayne@68
|
355 // allowMultithreadedFastq+", "+forceDisableMultithreadedFastq+", "+(mode==ONE_SKETCH);
|
jpayne@68
|
356
|
jpayne@68
|
357 if(ff.fastq() && allowMultithreadedFastq && !forceDisableMultithreadedFastq && (mode==ONE_SKETCH || mode==PER_FILE) &&
|
jpayne@68
|
358 maxThreads>1 && Shared.threads()>2 && (reads<1 || reads*samplerate*(mergePairs ? 2 : 1)>=1000)){//limit is low due to PacBio long reads
|
jpayne@68
|
359
|
jpayne@68
|
360 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
|
jpayne@68
|
361 Read.VALIDATE_IN_CONSTRUCTOR=false;
|
jpayne@68
|
362
|
jpayne@68
|
363 if(verbose2){System.err.println("Loading a sketch multithreaded.");}
|
jpayne@68
|
364 Sketch sketch=processReadsMT(ff.name(), maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
|
jpayne@68
|
365
|
jpayne@68
|
366 Read.VALIDATE_IN_CONSTRUCTOR=vic;
|
jpayne@68
|
367
|
jpayne@68
|
368 ArrayList<Sketch> list=new ArrayList<Sketch>(1);
|
jpayne@68
|
369 if(sketch!=null && (sketch.length()>0 || allowZeroSizeSketch)){
|
jpayne@68
|
370 sketch.loadSSU();
|
jpayne@68
|
371 list.add(sketch);
|
jpayne@68
|
372 }
|
jpayne@68
|
373 return list;
|
jpayne@68
|
374 }
|
jpayne@68
|
375 if(smm==null){smm=new SketchMakerMini(this, mode, minEntropy, minProb, minQual);}
|
jpayne@68
|
376 if(verbose2){System.err.println("Loading sketches via SMM.");}
|
jpayne@68
|
377 ArrayList<Sketch> sketches=smm.toSketches(ff.name(), samplerate, reads);
|
jpayne@68
|
378 if(verbose2){System.err.println("Loaded "+(sketches==null ? 0 : sketches.size())+" sketches via SMM.");}
|
jpayne@68
|
379 return sketches;
|
jpayne@68
|
380 }
|
jpayne@68
|
381
|
jpayne@68
|
382 private ArrayList<Sketch> loadSketchesFromSketchFile(final FileFormat ff, boolean allowZeroSizeSketch){
|
jpayne@68
|
383
|
jpayne@68
|
384 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
|
jpayne@68
|
385
|
jpayne@68
|
386 if(verbose2){System.err.println("Loading sketches from text.");}
|
jpayne@68
|
387 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
|
jpayne@68
|
388 ByteFile bf=ByteFile.makeByteFile(ff);
|
jpayne@68
|
389 int currentSketchSize=stTargetSketchSize;
|
jpayne@68
|
390 int taxID=-1;
|
jpayne@68
|
391 long spid=-1;
|
jpayne@68
|
392 long imgID=-1;
|
jpayne@68
|
393 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
|
jpayne@68
|
394 long[] baseCounts=null;
|
jpayne@68
|
395 byte[] r16S=null;
|
jpayne@68
|
396 byte[] r18S=null;
|
jpayne@68
|
397 int r16SLen=0;
|
jpayne@68
|
398 int r18SLen=0;
|
jpayne@68
|
399 float probCorrect=-1;
|
jpayne@68
|
400 String name=null, name0=null, fname=ff.simpleName();
|
jpayne@68
|
401 LongList list=null;
|
jpayne@68
|
402 IntList countList=null;
|
jpayne@68
|
403 ArrayList<String> meta=null;
|
jpayne@68
|
404 long sum=0;
|
jpayne@68
|
405 byte[] lastHeader=null;
|
jpayne@68
|
406
|
jpayne@68
|
407 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
|
jpayne@68
|
408 if(line.length>0){
|
jpayne@68
|
409 // System.err.println("Processing line "+new String(line));
|
jpayne@68
|
410 if(line[0]=='#'){
|
jpayne@68
|
411 if(r16SLen>0 || r18SLen>0){
|
jpayne@68
|
412 byte[] ssu=KillSwitch.copyOfRange(line, 5, line.length);
|
jpayne@68
|
413 if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){
|
jpayne@68
|
414 assert(r16SLen>0);
|
jpayne@68
|
415 assert(ssu.length==r16SLen) : r16SLen+", "+line.length+"\n"+new String(line)+"\n";
|
jpayne@68
|
416 r16S=ssu;
|
jpayne@68
|
417 r16SLen=0;
|
jpayne@68
|
418 }else if(Tools.startsWith(line, "#18S:")){
|
jpayne@68
|
419 assert(r18SLen>0);
|
jpayne@68
|
420 assert(ssu.length==r18SLen) : r18SLen+", "+line.length+"\n"+new String(line)+"\n";
|
jpayne@68
|
421 r18S=ssu;
|
jpayne@68
|
422 r18SLen=0;
|
jpayne@68
|
423 }else{
|
jpayne@68
|
424 assert(false) : new String(line);
|
jpayne@68
|
425 }
|
jpayne@68
|
426 }else{
|
jpayne@68
|
427 lastHeader=line;
|
jpayne@68
|
428 if(list!=null){
|
jpayne@68
|
429 assert(list.size==list.array.length);
|
jpayne@68
|
430 if(NUC || unsorted){
|
jpayne@68
|
431 list.sort();
|
jpayne@68
|
432 list.shrinkToUnique();
|
jpayne@68
|
433 }else{
|
jpayne@68
|
434 list.shrink();
|
jpayne@68
|
435 }
|
jpayne@68
|
436 if(list.size()>0 || allowZeroSizeSketch){
|
jpayne@68
|
437 int[] keyCounts=countList==null ? null : countList.array;
|
jpayne@68
|
438 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
|
jpayne@68
|
439 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
|
jpayne@68
|
440 sketch.spid=spid;
|
jpayne@68
|
441 sketches.add(sketch);
|
jpayne@68
|
442 }
|
jpayne@68
|
443 // System.err.println("Made sketch "+sketch);
|
jpayne@68
|
444 }
|
jpayne@68
|
445 name=name0=null;
|
jpayne@68
|
446 fname=ff.simpleName();
|
jpayne@68
|
447 list=null;
|
jpayne@68
|
448 countList=null;
|
jpayne@68
|
449 meta=null;
|
jpayne@68
|
450 baseCounts=null;
|
jpayne@68
|
451 r16S=null;
|
jpayne@68
|
452 r18S=null;
|
jpayne@68
|
453 r16SLen=0;
|
jpayne@68
|
454 r18SLen=0;
|
jpayne@68
|
455 sum=0;
|
jpayne@68
|
456 taxID=-1;
|
jpayne@68
|
457 imgID=-1;
|
jpayne@68
|
458 genomeSizeBases=0;
|
jpayne@68
|
459 genomeSizeKmers=0;
|
jpayne@68
|
460 genomeSequences=0;
|
jpayne@68
|
461 probCorrect=-1;
|
jpayne@68
|
462 int k_sketch=defaultK;
|
jpayne@68
|
463 int k2_sketch=0;
|
jpayne@68
|
464 int hashVersion_sketch=1;
|
jpayne@68
|
465
|
jpayne@68
|
466 if(line.length>1){
|
jpayne@68
|
467 String[] split=new String(line, 1, line.length-1).split("\t");
|
jpayne@68
|
468 for(String s : split){
|
jpayne@68
|
469 final int colon=s.indexOf(':');
|
jpayne@68
|
470 final String sub=s.substring(colon+1);
|
jpayne@68
|
471 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
|
jpayne@68
|
472 currentSketchSize=Integer.parseInt(sub);
|
jpayne@68
|
473 }else if(s.startsWith("CD:")){//Coding
|
jpayne@68
|
474 A48=HEX=NUC=delta=counts=unsorted=false;
|
jpayne@68
|
475
|
jpayne@68
|
476 for(int i=0; i<sub.length(); i++){
|
jpayne@68
|
477 char c=sub.charAt(i);
|
jpayne@68
|
478 if(c=='A'){A48=true;}
|
jpayne@68
|
479 else if(c=='H'){HEX=true;}
|
jpayne@68
|
480 else if(c=='R'){A48=HEX=false;}
|
jpayne@68
|
481 else if(c=='N'){NUC=true;}
|
jpayne@68
|
482 else if(c=='D'){delta=true;}
|
jpayne@68
|
483 else if(c=='C'){counts=true;}
|
jpayne@68
|
484 else if(c=='U'){unsorted=true;}
|
jpayne@68
|
485 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);}
|
jpayne@68
|
486 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);}
|
jpayne@68
|
487 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);}
|
jpayne@68
|
488 }
|
jpayne@68
|
489 }else if(s.startsWith("K:")){//Kmer length
|
jpayne@68
|
490 if(sub.indexOf(',')>=0){
|
jpayne@68
|
491 String[] subsplit=sub.split(",");
|
jpayne@68
|
492 assert(subsplit.length==2) : "Bad header component "+s;
|
jpayne@68
|
493 int x=Integer.parseInt(subsplit[0]);
|
jpayne@68
|
494 int y=Integer.parseInt(subsplit[1]);
|
jpayne@68
|
495 k_sketch=Tools.max(x, y);
|
jpayne@68
|
496 k2_sketch=Tools.min(x, y);
|
jpayne@68
|
497 }else{
|
jpayne@68
|
498 k_sketch=Integer.parseInt(sub);
|
jpayne@68
|
499 k2_sketch=0;
|
jpayne@68
|
500 }
|
jpayne@68
|
501 }else if(s.startsWith("H:")){//Hash version
|
jpayne@68
|
502 hashVersion_sketch=Integer.parseInt(sub);
|
jpayne@68
|
503 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
|
jpayne@68
|
504 baseCounts=Parse.parseLongArray(sub);
|
jpayne@68
|
505 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
|
jpayne@68
|
506 genomeSizeBases=Long.parseLong(sub);
|
jpayne@68
|
507 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
|
jpayne@68
|
508 genomeSizeKmers=Long.parseLong(sub);
|
jpayne@68
|
509 }else if(s.startsWith("GQ:")){
|
jpayne@68
|
510 genomeSequences=Long.parseLong(sub);
|
jpayne@68
|
511 }else if(s.startsWith("GE:")){//Genome size estimate kmers
|
jpayne@68
|
512 //ignore
|
jpayne@68
|
513 }else if(s.startsWith("PC:")){//Probability of correctness
|
jpayne@68
|
514 probCorrect=Float.parseFloat(sub);
|
jpayne@68
|
515 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
|
jpayne@68
|
516 taxID=Integer.parseInt(sub);
|
jpayne@68
|
517 }else if(s.startsWith("IMG:")){
|
jpayne@68
|
518 imgID=Long.parseLong(sub);
|
jpayne@68
|
519 }else if(s.startsWith("SPID:")){
|
jpayne@68
|
520 spid=Integer.parseInt(sub);
|
jpayne@68
|
521 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
|
jpayne@68
|
522 name=sub;
|
jpayne@68
|
523 }else if(s.startsWith("FN:")){
|
jpayne@68
|
524 fname=sub;
|
jpayne@68
|
525 }else if(s.startsWith("NM0:")){
|
jpayne@68
|
526 name0=sub;
|
jpayne@68
|
527 }else if(s.startsWith("MT_")){
|
jpayne@68
|
528 if(meta==null){meta=new ArrayList<String>(1);}
|
jpayne@68
|
529 meta.add(s.substring(3));
|
jpayne@68
|
530 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
|
jpayne@68
|
531 r16SLen=Integer.parseInt(sub);
|
jpayne@68
|
532 }else if(s.startsWith("18S:")){
|
jpayne@68
|
533 r18SLen=Integer.parseInt(sub);
|
jpayne@68
|
534 }else{
|
jpayne@68
|
535 assert(false) : "Unsupported header tag "+s;
|
jpayne@68
|
536 }
|
jpayne@68
|
537 }
|
jpayne@68
|
538 }
|
jpayne@68
|
539
|
jpayne@68
|
540 if(KILL_OK){
|
jpayne@68
|
541 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+
|
jpayne@68
|
542 " differs from loaded kmer length "+k+"\n"+new String(line)+"\nfile: "+ff.name());}
|
jpayne@68
|
543 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+
|
jpayne@68
|
544 " differs from loaded kmer length "+k+","+k2+"\n"+new String(line)+"\nfile: "+ff.name());}
|
jpayne@68
|
545 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
|
jpayne@68
|
546 " differs from loaded hash version "+HASH_VERSION+".\n"
|
jpayne@68
|
547 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\nfile: "+ff.name());}
|
jpayne@68
|
548 }else{//Potential hang
|
jpayne@68
|
549 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line);
|
jpayne@68
|
550 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line);
|
jpayne@68
|
551 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
|
jpayne@68
|
552 " differs from loaded hash version "+HASH_VERSION+".\n"
|
jpayne@68
|
553 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n";
|
jpayne@68
|
554 }
|
jpayne@68
|
555 if(currentSketchSize>0 || allowZeroSizeSketch){
|
jpayne@68
|
556 list=new LongList(Tools.max(1, currentSketchSize));
|
jpayne@68
|
557 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));}
|
jpayne@68
|
558 }
|
jpayne@68
|
559 }
|
jpayne@68
|
560 }else{
|
jpayne@68
|
561 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) :
|
jpayne@68
|
562 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Parse.parseLong(line));
|
jpayne@68
|
563 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
|
jpayne@68
|
564 sum+=x;
|
jpayne@68
|
565 assert(x>=0 || NUC) : "x="+x+"\nline="+new String(line)+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\nlineNum="+bf.lineNum()+"\n";
|
jpayne@68
|
566 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
|
jpayne@68
|
567 assert(list!=null) : new String(line);
|
jpayne@68
|
568 long key=(delta ? sum : x);
|
jpayne@68
|
569 if(key>=0){list.add(key);}
|
jpayne@68
|
570 }
|
jpayne@68
|
571 }
|
jpayne@68
|
572 }
|
jpayne@68
|
573 if(list!=null && (list.size>0 || allowZeroSizeSketch)){
|
jpayne@68
|
574 assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) : list.size+"!="+list.array.length;
|
jpayne@68
|
575 if(NUC || unsorted){
|
jpayne@68
|
576 list.sort();
|
jpayne@68
|
577 list.shrinkToUnique();
|
jpayne@68
|
578 }else{
|
jpayne@68
|
579 list.shrink();
|
jpayne@68
|
580 }
|
jpayne@68
|
581 int[] keyCounts=countList==null ? null : countList.array;
|
jpayne@68
|
582 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
|
jpayne@68
|
583 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
|
jpayne@68
|
584 sketch.spid=spid;
|
jpayne@68
|
585 sketches.add(sketch);
|
jpayne@68
|
586 }
|
jpayne@68
|
587 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");}
|
jpayne@68
|
588 return sketches;
|
jpayne@68
|
589 }
|
jpayne@68
|
590
|
jpayne@68
|
591 /** Usually much faster due to not manifesting the multithreaded load Java slowdown. Should incur less garbage collection also. */
|
jpayne@68
|
592 private ArrayList<Sketch> loadSketchesFromSketchFile2(final FileFormat ff, boolean allowZeroSizeSketch){
|
jpayne@68
|
593
|
jpayne@68
|
594 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
|
jpayne@68
|
595
|
jpayne@68
|
596 if(verbose2){System.err.println("Loading sketches from text.");}
|
jpayne@68
|
597 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
|
jpayne@68
|
598
|
jpayne@68
|
599 InputStream is=ReadWrite.getInputStream(ff.name(), BUFFERED_READER, false);
|
jpayne@68
|
600 byte[] buffer=new byte[BUFLEN];
|
jpayne@68
|
601 int start, limit=0;
|
jpayne@68
|
602 try {
|
jpayne@68
|
603 limit=is.read(buffer);
|
jpayne@68
|
604 } catch (IOException e) {
|
jpayne@68
|
605 KillSwitch.exceptionKill(e);
|
jpayne@68
|
606 }
|
jpayne@68
|
607
|
jpayne@68
|
608
|
jpayne@68
|
609 int currentSketchSize=stTargetSketchSize;
|
jpayne@68
|
610 int taxID=-1;
|
jpayne@68
|
611 long spid=-1;
|
jpayne@68
|
612 long imgID=-1;
|
jpayne@68
|
613 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
|
jpayne@68
|
614 long[] baseCounts=null;
|
jpayne@68
|
615 byte[] r16S=null;
|
jpayne@68
|
616 byte[] r18S=null;
|
jpayne@68
|
617 int r16SLen=0;
|
jpayne@68
|
618 int r18SLen=0;
|
jpayne@68
|
619 float probCorrect=-1;
|
jpayne@68
|
620 String name=null, name0=null, fname=ff.simpleName();
|
jpayne@68
|
621 LongList list=null;
|
jpayne@68
|
622 IntList countList=null;
|
jpayne@68
|
623 ArrayList<String> meta=null;
|
jpayne@68
|
624 long sum=0;
|
jpayne@68
|
625 byte[] lastHeader=null;
|
jpayne@68
|
626 ByteBuilder bb=new ByteBuilder(256);
|
jpayne@68
|
627
|
jpayne@68
|
628 for(start=0; start<limit;){
|
jpayne@68
|
629 // System.err.println("Processing line "+new String(line));
|
jpayne@68
|
630 if(buffer[start]=='#'){
|
jpayne@68
|
631 bb.clear();
|
jpayne@68
|
632 try {
|
jpayne@68
|
633 while(buffer[start]!='\n'){
|
jpayne@68
|
634 bb.append(buffer[start]);
|
jpayne@68
|
635 start++;
|
jpayne@68
|
636 if(start>=limit){start=0; limit=is.read(buffer);}
|
jpayne@68
|
637 }
|
jpayne@68
|
638 } catch (IOException e) {
|
jpayne@68
|
639 KillSwitch.exceptionKill(e);
|
jpayne@68
|
640 }
|
jpayne@68
|
641 start++;
|
jpayne@68
|
642
|
jpayne@68
|
643 if(r16SLen>0 || r18SLen>0){
|
jpayne@68
|
644 byte[] ssu=bb.toBytes(5, bb.length());
|
jpayne@68
|
645 if(bb.startsWith("#16S:") || bb.startsWith("#SSU:")){
|
jpayne@68
|
646 assert(r16SLen>0);
|
jpayne@68
|
647 assert(ssu.length==r16SLen) : r16SLen+", "+bb.length+"\n"+bb+"\n";
|
jpayne@68
|
648 r16S=ssu;
|
jpayne@68
|
649 r16SLen=0;
|
jpayne@68
|
650 }else if(bb.startsWith("#18S:")){
|
jpayne@68
|
651 assert(r18SLen>0);
|
jpayne@68
|
652 assert(ssu.length==r18SLen) : r18SLen+", "+bb.length+"\n"+bb+"\n";
|
jpayne@68
|
653 r18S=ssu;
|
jpayne@68
|
654 r18SLen=0;
|
jpayne@68
|
655 }else{
|
jpayne@68
|
656 assert(false) : bb;
|
jpayne@68
|
657 }
|
jpayne@68
|
658 }else{
|
jpayne@68
|
659
|
jpayne@68
|
660 // byte[] line=lastHeader=bb.toBytes();
|
jpayne@68
|
661 if(list!=null){
|
jpayne@68
|
662
|
jpayne@68
|
663 //This assertion fails sometimes for Silva per-sequence mode, but it's not important
|
jpayne@68
|
664 // assert(list.size==list.array.length) : list.size+", "+list.array.length+(lastHeader==null ? "" : ", "+new String(lastHeader));
|
jpayne@68
|
665
|
jpayne@68
|
666 if(NUC || unsorted){
|
jpayne@68
|
667 list.sort();
|
jpayne@68
|
668 list.shrinkToUnique();
|
jpayne@68
|
669 }else{
|
jpayne@68
|
670 list.shrink();
|
jpayne@68
|
671 }
|
jpayne@68
|
672 if(list.size()>0 || allowZeroSizeSketch){
|
jpayne@68
|
673 int[] keyCounts=countList==null ? null : countList.array;
|
jpayne@68
|
674 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
|
jpayne@68
|
675 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
|
jpayne@68
|
676 sketch.spid=spid;
|
jpayne@68
|
677 sketches.add(sketch);
|
jpayne@68
|
678 }
|
jpayne@68
|
679 // System.err.println("Made sketch "+sketch);
|
jpayne@68
|
680 }
|
jpayne@68
|
681 name=name0=null;
|
jpayne@68
|
682 fname=ff.simpleName();
|
jpayne@68
|
683 list=null;
|
jpayne@68
|
684 countList=null;
|
jpayne@68
|
685 meta=null;
|
jpayne@68
|
686 baseCounts=null;
|
jpayne@68
|
687 r16S=null;
|
jpayne@68
|
688 r18S=null;
|
jpayne@68
|
689 r16SLen=0;
|
jpayne@68
|
690 r18SLen=0;
|
jpayne@68
|
691 sum=0;
|
jpayne@68
|
692 taxID=-1;
|
jpayne@68
|
693 imgID=-1;
|
jpayne@68
|
694 genomeSizeBases=0;
|
jpayne@68
|
695 genomeSizeKmers=0;
|
jpayne@68
|
696 genomeSequences=0;
|
jpayne@68
|
697 probCorrect=-1;
|
jpayne@68
|
698 int k_sketch=defaultK;
|
jpayne@68
|
699 int k2_sketch=0;
|
jpayne@68
|
700 int hashVersion_sketch=1;
|
jpayne@68
|
701
|
jpayne@68
|
702 if(bb.length>1){
|
jpayne@68
|
703 String[] split=new String(bb.array, 1, bb.length-1).split("\t");
|
jpayne@68
|
704 for(String s : split){
|
jpayne@68
|
705 final int colon=s.indexOf(':');
|
jpayne@68
|
706 final String sub=s.substring(colon+1);
|
jpayne@68
|
707 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
|
jpayne@68
|
708 currentSketchSize=Integer.parseInt(sub);
|
jpayne@68
|
709 }else if(s.startsWith("CD:")){//Coding
|
jpayne@68
|
710 A48=HEX=NUC=delta=counts=unsorted=false;
|
jpayne@68
|
711
|
jpayne@68
|
712 for(int i=0; i<sub.length(); i++){
|
jpayne@68
|
713 char c=sub.charAt(i);
|
jpayne@68
|
714 if(c=='A'){A48=true;}
|
jpayne@68
|
715 else if(c=='H'){HEX=true;}
|
jpayne@68
|
716 else if(c=='R'){A48=HEX=false;}
|
jpayne@68
|
717 else if(c=='N'){NUC=true;}
|
jpayne@68
|
718 else if(c=='D'){delta=true;}
|
jpayne@68
|
719 else if(c=='C'){counts=true;}
|
jpayne@68
|
720 else if(c=='U'){unsorted=true;}
|
jpayne@68
|
721 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+bb;}
|
jpayne@68
|
722 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+bb;}
|
jpayne@68
|
723 else{assert(false) : "Unknown coding symbol: "+c+"\t"+bb;}
|
jpayne@68
|
724 }
|
jpayne@68
|
725 }else if(s.startsWith("K:")){//Kmer length
|
jpayne@68
|
726 if(sub.indexOf(',')>=0){
|
jpayne@68
|
727 String[] subsplit=sub.split(",");
|
jpayne@68
|
728 assert(subsplit.length==2) : "Bad header component "+s;
|
jpayne@68
|
729 int x=Integer.parseInt(subsplit[0]);
|
jpayne@68
|
730 int y=Integer.parseInt(subsplit[1]);
|
jpayne@68
|
731 k_sketch=Tools.max(x, y);
|
jpayne@68
|
732 k2_sketch=Tools.min(x, y);
|
jpayne@68
|
733 }else{
|
jpayne@68
|
734 k_sketch=Integer.parseInt(sub);
|
jpayne@68
|
735 k2_sketch=0;
|
jpayne@68
|
736 }
|
jpayne@68
|
737 }else if(s.startsWith("H:")){//Hash version
|
jpayne@68
|
738 hashVersion_sketch=Integer.parseInt(sub);
|
jpayne@68
|
739 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
|
jpayne@68
|
740 baseCounts=Parse.parseLongArray(sub);
|
jpayne@68
|
741 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
|
jpayne@68
|
742 genomeSizeBases=Long.parseLong(sub);
|
jpayne@68
|
743 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
|
jpayne@68
|
744 genomeSizeKmers=Long.parseLong(sub);
|
jpayne@68
|
745 }else if(s.startsWith("GQ:")){
|
jpayne@68
|
746 genomeSequences=Long.parseLong(sub);
|
jpayne@68
|
747 }else if(s.startsWith("GE:")){//Genome size estimate kmers
|
jpayne@68
|
748 //ignore
|
jpayne@68
|
749 }else if(s.startsWith("PC:")){//Probability of correctness
|
jpayne@68
|
750 probCorrect=Float.parseFloat(sub);
|
jpayne@68
|
751 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
|
jpayne@68
|
752 taxID=Integer.parseInt(sub);
|
jpayne@68
|
753 }else if(s.startsWith("IMG:")){
|
jpayne@68
|
754 imgID=Long.parseLong(sub);
|
jpayne@68
|
755 }else if(s.startsWith("SPID:")){
|
jpayne@68
|
756 spid=Integer.parseInt(sub);
|
jpayne@68
|
757 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
|
jpayne@68
|
758 name=sub;
|
jpayne@68
|
759 }else if(s.startsWith("FN:")){
|
jpayne@68
|
760 fname=sub;
|
jpayne@68
|
761 }else if(s.startsWith("NM0:")){
|
jpayne@68
|
762 name0=sub;
|
jpayne@68
|
763 }else if(s.startsWith("MT_")){
|
jpayne@68
|
764 if(meta==null){meta=new ArrayList<String>(1);}
|
jpayne@68
|
765 meta.add(s.substring(3));
|
jpayne@68
|
766 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
|
jpayne@68
|
767 r16SLen=Integer.parseInt(sub);
|
jpayne@68
|
768 }else if(s.startsWith("18S:")){
|
jpayne@68
|
769 r18SLen=Integer.parseInt(sub);
|
jpayne@68
|
770 }else{
|
jpayne@68
|
771 assert(false) : "Unsupported header tag "+s;
|
jpayne@68
|
772 }
|
jpayne@68
|
773 }
|
jpayne@68
|
774 }
|
jpayne@68
|
775
|
jpayne@68
|
776 if(KILL_OK){
|
jpayne@68
|
777 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+
|
jpayne@68
|
778 " differs from loaded kmer length "+k+"\n"+bb+"\nfile: "+ff.name());}
|
jpayne@68
|
779 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+
|
jpayne@68
|
780 " differs from loaded kmer length "+k+","+k2+"\n"+bb+"\nfile: "+ff.name());}
|
jpayne@68
|
781 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
|
jpayne@68
|
782 " differs from loaded hash version "+HASH_VERSION+".\n"
|
jpayne@68
|
783 + "You may need to download the latest version of BBTools.\nfile: "+ff.name());}
|
jpayne@68
|
784 }else{//Potential hang
|
jpayne@68
|
785 assert(k_sketch==k || NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+bb;
|
jpayne@68
|
786 assert(k2_sketch==k2 || NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+bb;
|
jpayne@68
|
787 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
|
jpayne@68
|
788 " differs from loaded hash version "+HASH_VERSION+".\n"
|
jpayne@68
|
789 + "You may need to download the latest version of BBTools.\nfile: "+ff.name();
|
jpayne@68
|
790 }
|
jpayne@68
|
791 if(currentSketchSize>0 || allowZeroSizeSketch){
|
jpayne@68
|
792 list=new LongList(Tools.max(1, currentSketchSize));
|
jpayne@68
|
793 if(counts){countList=new IntList(Tools.max(1, currentSketchSize));}
|
jpayne@68
|
794 }
|
jpayne@68
|
795 }
|
jpayne@68
|
796 }else{
|
jpayne@68
|
797 bb.clear();
|
jpayne@68
|
798 try {
|
jpayne@68
|
799 while(buffer[start]!='\n'){
|
jpayne@68
|
800 bb.append(buffer[start]);
|
jpayne@68
|
801 start++;
|
jpayne@68
|
802 if(start>=limit){start=0; limit=is.read(buffer);}
|
jpayne@68
|
803 }
|
jpayne@68
|
804 } catch (IOException e) {
|
jpayne@68
|
805 KillSwitch.exceptionKill(e);
|
jpayne@68
|
806 }
|
jpayne@68
|
807 start++;
|
jpayne@68
|
808
|
jpayne@68
|
809 long x=(counts ? Sketch.parseA48C(bb, countList) : A48 ? Sketch.parseA48(bb) :
|
jpayne@68
|
810 HEX ? Sketch.parseHex(bb) : NUC ? Sketch.parseNuc(bb) : Parse.parseLong(bb.array, 0, bb.length));
|
jpayne@68
|
811 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
|
jpayne@68
|
812 sum+=x;
|
jpayne@68
|
813 assert(x>=0 || NUC) : "x="+x+"\nline="+bb+"\nheader="+(lastHeader==null ? "null" : new String(lastHeader))+"\n";
|
jpayne@68
|
814 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
|
jpayne@68
|
815 assert(list!=null) : bb;
|
jpayne@68
|
816 long key=(delta ? sum : x);
|
jpayne@68
|
817 if(key>=0){list.add(key);}
|
jpayne@68
|
818 }
|
jpayne@68
|
819
|
jpayne@68
|
820 if(start>=limit){
|
jpayne@68
|
821 start=0;
|
jpayne@68
|
822 try {
|
jpayne@68
|
823 limit=is.read(buffer);
|
jpayne@68
|
824 } catch (IOException e) {
|
jpayne@68
|
825 KillSwitch.exceptionKill(e);
|
jpayne@68
|
826 }
|
jpayne@68
|
827 }
|
jpayne@68
|
828 }
|
jpayne@68
|
829 if(list!=null && (list.size>0 || allowZeroSizeSketch)){
|
jpayne@68
|
830
|
jpayne@68
|
831 //This assertion fails sometimes for Silva per-sequence mode, but it's not important
|
jpayne@68
|
832 // assert(list.size==list.array.length || NUC || unsorted || (allowZeroSizeSketch && list.size==0)) :
|
jpayne@68
|
833 // list.size+"!="+list.array.length+(lastHeader==null ? "" : "\n"+new String(lastHeader));
|
jpayne@68
|
834
|
jpayne@68
|
835 if(NUC || unsorted){
|
jpayne@68
|
836 list.sort();
|
jpayne@68
|
837 list.shrinkToUnique();
|
jpayne@68
|
838 }else{
|
jpayne@68
|
839 list.shrink();
|
jpayne@68
|
840 }
|
jpayne@68
|
841 int[] keyCounts=countList==null ? null : countList.array;
|
jpayne@68
|
842 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
|
jpayne@68
|
843 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
|
jpayne@68
|
844 sketch.spid=spid;
|
jpayne@68
|
845 sketches.add(sketch);
|
jpayne@68
|
846 }
|
jpayne@68
|
847 if(verbose2){System.err.println("Loaded "+sketches.size()+" sketches from text.");}
|
jpayne@68
|
848 try {
|
jpayne@68
|
849 is.close();
|
jpayne@68
|
850 } catch (IOException e) {
|
jpayne@68
|
851 e.printStackTrace();
|
jpayne@68
|
852 }
|
jpayne@68
|
853 return sketches;
|
jpayne@68
|
854 }
|
jpayne@68
|
855
|
jpayne@68
|
856 /*--------------------------------------------------------------*/
|
jpayne@68
|
857
|
jpayne@68
|
858 public ArrayList<Sketch> loadSketchesFromString(String sketchString){
|
jpayne@68
|
859 boolean A48=(Sketch.CODING==Sketch.A48), HEX=(Sketch.CODING==Sketch.HEX), NUC=false, delta=true, counts=false, unsorted=false;
|
jpayne@68
|
860
|
jpayne@68
|
861 ArrayList<Sketch> sketches=new ArrayList<Sketch>();
|
jpayne@68
|
862 int currentSketchSize=stTargetSketchSize;
|
jpayne@68
|
863 int taxID=-1;
|
jpayne@68
|
864 long spid=-1;
|
jpayne@68
|
865 long imgID=-1;
|
jpayne@68
|
866 long genomeSizeBases=0, genomeSizeKmers=0, genomeSequences=0;
|
jpayne@68
|
867 long[] baseCounts=null;
|
jpayne@68
|
868 byte[] r16S=null;
|
jpayne@68
|
869 byte[] r18S=null;
|
jpayne@68
|
870 int r16SLen=0;
|
jpayne@68
|
871 int r18SLen=0;
|
jpayne@68
|
872 float probCorrect=-1;
|
jpayne@68
|
873 String name=null, name0=null, fname=null;
|
jpayne@68
|
874 LongList list=null;
|
jpayne@68
|
875 IntList countList=null;
|
jpayne@68
|
876 ArrayList<String> meta=null;
|
jpayne@68
|
877 long sum=0;
|
jpayne@68
|
878
|
jpayne@68
|
879 String[] split0=sketchString.split("\n");
|
jpayne@68
|
880 for(String line : split0){
|
jpayne@68
|
881 if(line.length()>0){
|
jpayne@68
|
882 // System.err.println("Processing line "+new String(line));
|
jpayne@68
|
883 if(line.charAt(0)=='#'){
|
jpayne@68
|
884 if(line.length()>1 && line.charAt(1)=='#'){
|
jpayne@68
|
885 //ignore
|
jpayne@68
|
886 }else if(r16SLen>0 || r18SLen>0){
|
jpayne@68
|
887 byte[] ssu=KillSwitch.copyOfRange(line.getBytes(), 5, line.length());
|
jpayne@68
|
888 if(line.startsWith("#16S:") || line.startsWith("#SSU:")){
|
jpayne@68
|
889 assert(r16SLen>0);
|
jpayne@68
|
890 assert(ssu.length==r16SLen) : r16SLen+", "+line.length()+"\n"+line+"\n";
|
jpayne@68
|
891 r16S=ssu;
|
jpayne@68
|
892 r16SLen=0;
|
jpayne@68
|
893 }else if(line.startsWith("#18S:")){
|
jpayne@68
|
894 assert(r18SLen>0);
|
jpayne@68
|
895 assert(ssu.length==r18SLen) : r18SLen+", "+line.length()+"\n"+line+"\n";
|
jpayne@68
|
896 r18S=ssu;
|
jpayne@68
|
897 r18SLen=0;
|
jpayne@68
|
898 }else{
|
jpayne@68
|
899 assert(false) : line;
|
jpayne@68
|
900 }
|
jpayne@68
|
901 }else{
|
jpayne@68
|
902 if(list!=null){
|
jpayne@68
|
903 assert(list.size==list.array.length);
|
jpayne@68
|
904 if(NUC || unsorted){
|
jpayne@68
|
905 list.sort();
|
jpayne@68
|
906 list.shrinkToUnique();
|
jpayne@68
|
907 }else{
|
jpayne@68
|
908 list.shrink();
|
jpayne@68
|
909 }
|
jpayne@68
|
910 if(list.size()>0){
|
jpayne@68
|
911 int[] keyCounts=countList==null ? null : countList.array;
|
jpayne@68
|
912 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
|
jpayne@68
|
913 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
|
jpayne@68
|
914 sketch.spid=spid;
|
jpayne@68
|
915 sketches.add(sketch);
|
jpayne@68
|
916 }
|
jpayne@68
|
917 // System.err.println("Made sketch "+sketch);
|
jpayne@68
|
918 }
|
jpayne@68
|
919 name=name0=null;
|
jpayne@68
|
920 fname=null;
|
jpayne@68
|
921 list=null;
|
jpayne@68
|
922 countList=null;
|
jpayne@68
|
923 meta=null;
|
jpayne@68
|
924 baseCounts=null;
|
jpayne@68
|
925 r16S=null;
|
jpayne@68
|
926 r18S=null;
|
jpayne@68
|
927 r16SLen=0;
|
jpayne@68
|
928 r18SLen=0;
|
jpayne@68
|
929 sum=0;
|
jpayne@68
|
930 taxID=-1;
|
jpayne@68
|
931 imgID=-1;
|
jpayne@68
|
932 genomeSizeBases=0;
|
jpayne@68
|
933 genomeSizeKmers=0;
|
jpayne@68
|
934 genomeSequences=0;
|
jpayne@68
|
935 probCorrect=-1;
|
jpayne@68
|
936 int k_sketch=defaultK;
|
jpayne@68
|
937 int k2_sketch=0;
|
jpayne@68
|
938 int hashVersion_sketch=1;
|
jpayne@68
|
939
|
jpayne@68
|
940 if(line.length()>1){
|
jpayne@68
|
941 String[] split=line.substring(1).split("\t");
|
jpayne@68
|
942 for(String s : split){
|
jpayne@68
|
943 final int colon=s.indexOf(':');
|
jpayne@68
|
944 final String sub=s.substring(colon+1);
|
jpayne@68
|
945 if(s.startsWith("SZ:") || s.startsWith("SIZE:")){//Sketch length
|
jpayne@68
|
946 currentSketchSize=Integer.parseInt(sub);
|
jpayne@68
|
947 }else if(s.startsWith("CD:")){//Coding
|
jpayne@68
|
948 A48=HEX=NUC=delta=counts=false;
|
jpayne@68
|
949
|
jpayne@68
|
950 for(int i=0; i<sub.length(); i++){
|
jpayne@68
|
951 char c=sub.charAt(i);
|
jpayne@68
|
952 if(c=='A'){A48=true;}
|
jpayne@68
|
953 else if(c=='H'){HEX=true;}
|
jpayne@68
|
954 else if(c=='R'){A48=HEX=false;}
|
jpayne@68
|
955 else if(c=='N'){NUC=true;}
|
jpayne@68
|
956 else if(c=='D'){delta=true;}
|
jpayne@68
|
957 else if(c=='C'){counts=true;}
|
jpayne@68
|
958 else if(c=='U'){unsorted=true;}
|
jpayne@68
|
959 else if(c=='M'){assert(aminoOrTranslate()) : "Amino sketch in non-amino mode: "+new String(line);}
|
jpayne@68
|
960 else if(c=='8'){assert(amino8) : "Amino8 sketch in non-amino8 mode: "+new String(line);}
|
jpayne@68
|
961 else{assert(false) : "Unknown coding symbol: "+c+"\t"+new String(line);}
|
jpayne@68
|
962 }
|
jpayne@68
|
963
|
jpayne@68
|
964 }else if(s.startsWith("K:")){//Kmer length
|
jpayne@68
|
965 if(sub.indexOf(',')>=0){
|
jpayne@68
|
966 String[] subsplit=sub.split(",");
|
jpayne@68
|
967 assert(subsplit.length==2) : "Bad header component "+s;
|
jpayne@68
|
968 int x=Integer.parseInt(subsplit[0]);
|
jpayne@68
|
969 int y=Integer.parseInt(subsplit[1]);
|
jpayne@68
|
970 k_sketch=Tools.max(x, y);
|
jpayne@68
|
971 k2_sketch=Tools.min(x, y);
|
jpayne@68
|
972 }else{
|
jpayne@68
|
973 k_sketch=Integer.parseInt(s);
|
jpayne@68
|
974 k2_sketch=0;
|
jpayne@68
|
975 }
|
jpayne@68
|
976 }else if(s.startsWith("H:")){//Hash version
|
jpayne@68
|
977 hashVersion_sketch=Integer.parseInt(sub);
|
jpayne@68
|
978 }else if(s.startsWith("BC:") || s.startsWith("BASECOUNTS:")){//ACGTN
|
jpayne@68
|
979 baseCounts=Parse.parseLongArray(sub);
|
jpayne@68
|
980 }else if(s.startsWith("GS:") || s.startsWith("GSIZE:")){//Genomic bases
|
jpayne@68
|
981 genomeSizeBases=Long.parseLong(sub);
|
jpayne@68
|
982 }else if(s.startsWith("GK:") || s.startsWith("GKMERS:")){//Genomic kmers
|
jpayne@68
|
983 genomeSizeKmers=Long.parseLong(sub);
|
jpayne@68
|
984 }else if(s.startsWith("GQ:")){
|
jpayne@68
|
985 genomeSequences=Long.parseLong(sub);
|
jpayne@68
|
986 }else if(s.startsWith("GE:")){//Genome size estimate kmers
|
jpayne@68
|
987 //ignore
|
jpayne@68
|
988 }else if(s.startsWith("PC:")){//Probability of correctness
|
jpayne@68
|
989 probCorrect=Float.parseFloat(sub);
|
jpayne@68
|
990 }else if(s.startsWith("ID:") || s.startsWith("TAXID:")){
|
jpayne@68
|
991 taxID=Integer.parseInt(sub);
|
jpayne@68
|
992 }else if(s.startsWith("IMG:")){
|
jpayne@68
|
993 imgID=Long.parseLong(sub);
|
jpayne@68
|
994 }else if(s.startsWith("SPID:")){
|
jpayne@68
|
995 spid=Integer.parseInt(sub);
|
jpayne@68
|
996 }else if(s.startsWith("NM:") || s.startsWith("NAME:")){
|
jpayne@68
|
997 name=sub;
|
jpayne@68
|
998 }else if(s.startsWith("FN:")){
|
jpayne@68
|
999 fname=sub;
|
jpayne@68
|
1000 }else if(s.startsWith("NM0:")){
|
jpayne@68
|
1001 name0=sub;
|
jpayne@68
|
1002 }else if(s.startsWith("MT_")){
|
jpayne@68
|
1003 if(meta==null){meta=new ArrayList<String>(1);}
|
jpayne@68
|
1004 meta.add(s.substring(3));
|
jpayne@68
|
1005 }else if(s.startsWith("16S:") || s.startsWith("SSU:")){
|
jpayne@68
|
1006 r16SLen=Integer.parseInt(sub);
|
jpayne@68
|
1007 }else if(s.startsWith("18S:")){
|
jpayne@68
|
1008 r18SLen=Integer.parseInt(sub);
|
jpayne@68
|
1009 }else{
|
jpayne@68
|
1010 assert(false) : "Unsupported header tag "+s;
|
jpayne@68
|
1011 }
|
jpayne@68
|
1012 }
|
jpayne@68
|
1013 }
|
jpayne@68
|
1014 if(KILL_OK){
|
jpayne@68
|
1015 if(k_sketch!=k && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line));}
|
jpayne@68
|
1016 if(k2_sketch!=k2 && !NUC){KillSwitch.kill("Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line));}
|
jpayne@68
|
1017 if(hashVersion_sketch!=HASH_VERSION && !NUC){KillSwitch.kill("Sketch hash version "+hashVersion_sketch+
|
jpayne@68
|
1018 " differs from loaded hash version "+HASH_VERSION+".\n"
|
jpayne@68
|
1019 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n");}
|
jpayne@68
|
1020 }else{//Potential hang
|
jpayne@68
|
1021 assert(k_sketch==k && !NUC) : "Sketch kmer length "+k_sketch+" differs from loaded kmer length "+k+"\n"+new String(line);
|
jpayne@68
|
1022 assert(k2_sketch==k2 && !NUC) : "Sketch kmer length "+k_sketch+","+k2_sketch+" differs from loaded kmer length "+k+","+k2+"\n"+new String(line);
|
jpayne@68
|
1023 assert(hashVersion_sketch==HASH_VERSION || NUC) : "Sketch hash version "+hashVersion_sketch+
|
jpayne@68
|
1024 " differs from loaded hash version "+HASH_VERSION+".\n"
|
jpayne@68
|
1025 + "You may need to download the latest version of BBTools.\n"+new String(line)+"\n";
|
jpayne@68
|
1026 }
|
jpayne@68
|
1027
|
jpayne@68
|
1028
|
jpayne@68
|
1029 if(currentSketchSize>0){
|
jpayne@68
|
1030 list=new LongList(currentSketchSize);
|
jpayne@68
|
1031 if(counts){countList=new IntList(currentSketchSize);}
|
jpayne@68
|
1032 }
|
jpayne@68
|
1033 }
|
jpayne@68
|
1034 }else{
|
jpayne@68
|
1035 long x=(counts ? Sketch.parseA48C(line, countList) : A48 ? Sketch.parseA48(line) :
|
jpayne@68
|
1036 HEX ? Sketch.parseHex(line) : NUC ? Sketch.parseNuc(line) : Long.parseLong(line));
|
jpayne@68
|
1037 // System.err.println("sum="+sum+", x="+x+" -> "+(sum+x));
|
jpayne@68
|
1038 sum+=x;
|
jpayne@68
|
1039 assert(x>=0 || NUC) : x+"\n"+new String(line);
|
jpayne@68
|
1040 assert(sum>=0 || !delta) : "The sketch was made with delta compression off. Please regenerate it.";
|
jpayne@68
|
1041 long key=(delta ? sum : x);
|
jpayne@68
|
1042 if(key>=0){list.add(key);}
|
jpayne@68
|
1043 }
|
jpayne@68
|
1044 }
|
jpayne@68
|
1045 }
|
jpayne@68
|
1046
|
jpayne@68
|
1047 if(list!=null){
|
jpayne@68
|
1048 assert(list.size==list.array.length || list.size()==0 || NUC || unsorted);
|
jpayne@68
|
1049 if(NUC || unsorted){
|
jpayne@68
|
1050 list.sort();
|
jpayne@68
|
1051 list.shrinkToUnique();
|
jpayne@68
|
1052 }else{
|
jpayne@68
|
1053 list.shrink();
|
jpayne@68
|
1054 }
|
jpayne@68
|
1055 int[] keyCounts=countList==null ? null : countList.array;
|
jpayne@68
|
1056 Sketch sketch=new Sketch(list.array, keyCounts, baseCounts, r16S, r18S, taxID, imgID,
|
jpayne@68
|
1057 genomeSizeBases, genomeSizeKmers, genomeSequences, probCorrect, name, name0, fname, meta);
|
jpayne@68
|
1058 sketch.spid=spid;
|
jpayne@68
|
1059 sketches.add(sketch);
|
jpayne@68
|
1060 }
|
jpayne@68
|
1061 return sketches;
|
jpayne@68
|
1062 }
|
jpayne@68
|
1063
|
jpayne@68
|
1064 /*--------------------------------------------------------------*/
|
jpayne@68
|
1065 /*---------------- Threads ----------------*/
|
jpayne@68
|
1066 /*--------------------------------------------------------------*/
|
jpayne@68
|
1067
|
jpayne@68
|
1068
|
jpayne@68
|
1069
|
jpayne@68
|
1070 private class LoadThread extends Thread{
|
jpayne@68
|
1071
|
jpayne@68
|
1072 public LoadThread(ConcurrentLinkedQueue<StringNum> queue_, int mode_, float samplerate_, long reads_, float minEntropy, float minProb, byte minQual) {
|
jpayne@68
|
1073 queue=queue_;
|
jpayne@68
|
1074 list=new ArrayList<Sketch>();
|
jpayne@68
|
1075 smm=new SketchMakerMini(SketchTool.this, mode_, minEntropy, minProb, minQual);
|
jpayne@68
|
1076 samplerate=samplerate_;
|
jpayne@68
|
1077 reads=reads_;
|
jpayne@68
|
1078 }
|
jpayne@68
|
1079
|
jpayne@68
|
1080 @Override
|
jpayne@68
|
1081 public void run(){
|
jpayne@68
|
1082 success=false;
|
jpayne@68
|
1083 for(StringNum sn=queue.poll(); sn!=null; sn=queue.poll()){
|
jpayne@68
|
1084 ArrayList<Sketch> temp=null;
|
jpayne@68
|
1085 try {
|
jpayne@68
|
1086 temp=loadSketchesFromFile(sn.s, smm, 1, reads, smm.mode, samplerate, smm.minEntropy(), smm.minProb(), smm.minQual(), false);
|
jpayne@68
|
1087 } catch (Throwable e) {
|
jpayne@68
|
1088 System.err.println("Failure loading "+sn+":\n"+e);
|
jpayne@68
|
1089 e.printStackTrace();
|
jpayne@68
|
1090 success=false;
|
jpayne@68
|
1091 }
|
jpayne@68
|
1092 if(temp!=null && temp.size()>0){
|
jpayne@68
|
1093 if(smm.mode==PER_FILE){
|
jpayne@68
|
1094 // assert(temp.size()==1) : temp.size();
|
jpayne@68
|
1095 temp.get(0).sketchID=(int)sn.n;
|
jpayne@68
|
1096 }
|
jpayne@68
|
1097 for(Sketch s : temp){add(s);}
|
jpayne@68
|
1098 }
|
jpayne@68
|
1099 }
|
jpayne@68
|
1100 success=true;
|
jpayne@68
|
1101 }
|
jpayne@68
|
1102
|
jpayne@68
|
1103 private void add(Sketch s){
|
jpayne@68
|
1104 if(list!=null){
|
jpayne@68
|
1105 list.add(s);
|
jpayne@68
|
1106 return;
|
jpayne@68
|
1107 }
|
jpayne@68
|
1108 assert(false) : "Unsupported."; //The map logic is broken; needs to be synchronized.
|
jpayne@68
|
1109 // if(s.taxID<0){return;}
|
jpayne@68
|
1110 //// assert(s.taxID>-1) : s.toHeader();
|
jpayne@68
|
1111 // TaxNode tn=tree.getNode(s.taxID);
|
jpayne@68
|
1112 // while(tn!=null && tn.pid!=tn.id && tn.level<taxLevel){
|
jpayne@68
|
1113 // TaxNode temp=tree.getNode(tn.pid);
|
jpayne@68
|
1114 // if(temp==null){break;}
|
jpayne@68
|
1115 // tn=temp;
|
jpayne@68
|
1116 // }
|
jpayne@68
|
1117 // if(tn==null){return;}
|
jpayne@68
|
1118 // Integer key=tn.id;
|
jpayne@68
|
1119 // Sketch old=map.get(key);
|
jpayne@68
|
1120 // if(old==null){
|
jpayne@68
|
1121 // s.taxID=key;
|
jpayne@68
|
1122 // map.put(key, s);
|
jpayne@68
|
1123 // }else{
|
jpayne@68
|
1124 // synchronized(old){
|
jpayne@68
|
1125 // old.add(s, maxLen);
|
jpayne@68
|
1126 // }
|
jpayne@68
|
1127 // }
|
jpayne@68
|
1128 }
|
jpayne@68
|
1129
|
jpayne@68
|
1130 final ConcurrentLinkedQueue<StringNum> queue;
|
jpayne@68
|
1131 ArrayList<Sketch> list;
|
jpayne@68
|
1132 boolean success=false;
|
jpayne@68
|
1133 final SketchMakerMini smm;
|
jpayne@68
|
1134 final float samplerate;
|
jpayne@68
|
1135 final long reads;
|
jpayne@68
|
1136
|
jpayne@68
|
1137 // ConcurrentHashMap<Integer, Sketch> map;
|
jpayne@68
|
1138
|
jpayne@68
|
1139 }
|
jpayne@68
|
1140
|
jpayne@68
|
1141 private class LoadThread2 extends Thread{
|
jpayne@68
|
1142
|
jpayne@68
|
1143 LoadThread2(ConcurrentReadInputStream cris_, float minEntropy, float minProb, byte minQual){
|
jpayne@68
|
1144 cris=cris_;
|
jpayne@68
|
1145 smm=new SketchMakerMini(SketchTool.this, ONE_SKETCH, minEntropy, minProb, minQual);
|
jpayne@68
|
1146 }
|
jpayne@68
|
1147
|
jpayne@68
|
1148 @Override
|
jpayne@68
|
1149 public void run(){
|
jpayne@68
|
1150
|
jpayne@68
|
1151 //Grab the first ListNum of reads
|
jpayne@68
|
1152 ListNum<Read> ln=cris.nextList();
|
jpayne@68
|
1153 //Grab the actual read list from the ListNum
|
jpayne@68
|
1154 ArrayList<Read> reads=(ln!=null ? ln.list : null);
|
jpayne@68
|
1155
|
jpayne@68
|
1156 //As long as there is a nonempty read list...
|
jpayne@68
|
1157 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
|
jpayne@68
|
1158
|
jpayne@68
|
1159 //Loop through each read in the list
|
jpayne@68
|
1160 for(int idx=0; idx<reads.size(); idx++){
|
jpayne@68
|
1161 final Read r1=reads.get(idx);
|
jpayne@68
|
1162 final Read r2=r1.mate;
|
jpayne@68
|
1163
|
jpayne@68
|
1164 if(validate){
|
jpayne@68
|
1165 if(r1!=null){r1.validate(true);}
|
jpayne@68
|
1166 if(r2!=null){r2.validate(true);}
|
jpayne@68
|
1167 }
|
jpayne@68
|
1168
|
jpayne@68
|
1169 smm.processReadPair(r1, r2);
|
jpayne@68
|
1170 }
|
jpayne@68
|
1171
|
jpayne@68
|
1172 //Notify the input stream that the list was used
|
jpayne@68
|
1173 cris.returnList(ln);
|
jpayne@68
|
1174
|
jpayne@68
|
1175 //Fetch a new list
|
jpayne@68
|
1176 ln=cris.nextList();
|
jpayne@68
|
1177 reads=(ln!=null ? ln.list : null);
|
jpayne@68
|
1178 }
|
jpayne@68
|
1179
|
jpayne@68
|
1180 //Notify the input stream that the final list was used
|
jpayne@68
|
1181 if(ln!=null){
|
jpayne@68
|
1182 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
|
jpayne@68
|
1183 }
|
jpayne@68
|
1184 }
|
jpayne@68
|
1185
|
jpayne@68
|
1186 private final boolean validate=!Read.VALIDATE_IN_CONSTRUCTOR;
|
jpayne@68
|
1187 ConcurrentReadInputStream cris;
|
jpayne@68
|
1188 SketchMakerMini smm;
|
jpayne@68
|
1189
|
jpayne@68
|
1190 }
|
jpayne@68
|
1191
|
jpayne@68
|
1192 /** Converts KmerTableSets to Heaps */
|
jpayne@68
|
1193 private class SketchThread extends Thread {
|
jpayne@68
|
1194
|
jpayne@68
|
1195 SketchThread(AtomicInteger next_, KmerTableSet kts_){
|
jpayne@68
|
1196 next=next_;
|
jpayne@68
|
1197 kts=kts_;
|
jpayne@68
|
1198 }
|
jpayne@68
|
1199
|
jpayne@68
|
1200 @Override
|
jpayne@68
|
1201 public void run(){
|
jpayne@68
|
1202 final int ways=kts.ways();
|
jpayne@68
|
1203 int tnum=next.getAndIncrement();
|
jpayne@68
|
1204 while(tnum<ways){
|
jpayne@68
|
1205 HashArray1D table=kts.getTable(tnum);
|
jpayne@68
|
1206 if(stTargetSketchSize>0){
|
jpayne@68
|
1207 if(heap==null){heap=new SketchHeap(stTargetSketchSize, minKeyOccuranceCount, trackCounts);}
|
jpayne@68
|
1208 toHeap(table, heap);
|
jpayne@68
|
1209 }else{
|
jpayne@68
|
1210 if(list==null){list=new LongList();}
|
jpayne@68
|
1211 toList(table, list);
|
jpayne@68
|
1212 }
|
jpayne@68
|
1213 tnum=next.getAndIncrement();
|
jpayne@68
|
1214 }
|
jpayne@68
|
1215 }
|
jpayne@68
|
1216
|
jpayne@68
|
1217 final AtomicInteger next;
|
jpayne@68
|
1218 final KmerTableSet kts;
|
jpayne@68
|
1219 SketchHeap heap;
|
jpayne@68
|
1220 LongList list;
|
jpayne@68
|
1221 }
|
jpayne@68
|
1222
|
jpayne@68
|
1223 /*--------------------------------------------------------------*/
|
jpayne@68
|
1224 /*---------------- Read Loading ----------------*/
|
jpayne@68
|
1225 /*--------------------------------------------------------------*/
|
jpayne@68
|
1226
|
jpayne@68
|
1227 public Sketch processReadsMT(String fname, int maxThreads, long reads, int mode, float samplerate,
|
jpayne@68
|
1228 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
|
jpayne@68
|
1229 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
|
jpayne@68
|
1230 return processReadsMT(fname.replaceFirst("#", "1"), fname.replaceFirst("#", "2"), maxThreads, reads, mode, samplerate,
|
jpayne@68
|
1231 minEntropy, minProb, minQual, allowZeroSizeSketch);
|
jpayne@68
|
1232 }else{
|
jpayne@68
|
1233 return processReadsMT(fname, null, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
|
jpayne@68
|
1234 }
|
jpayne@68
|
1235 }
|
jpayne@68
|
1236
|
jpayne@68
|
1237 public Sketch processReadsMT(String fname1, String fname2, int maxThreads, long reads, int mode, float samplerate,
|
jpayne@68
|
1238 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
|
jpayne@68
|
1239 final FileFormat ffin1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
|
jpayne@68
|
1240 final FileFormat ffin2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
|
jpayne@68
|
1241 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, samplerate, minEntropy, minProb, minQual, allowZeroSizeSketch);
|
jpayne@68
|
1242 }
|
jpayne@68
|
1243
|
jpayne@68
|
1244 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, DisplayParams params, boolean allowZeroSizeSketch){
|
jpayne@68
|
1245 return processReadsMT(ffin1, ffin2, maxThreads, reads, mode, params.samplerate, params.minEntropy, params.minProb, params.minQual, allowZeroSizeSketch);
|
jpayne@68
|
1246 }
|
jpayne@68
|
1247
|
jpayne@68
|
1248 public Sketch processReadsMT(FileFormat ffin1, FileFormat ffin2, int maxThreads, long reads, int mode, float samplerate,
|
jpayne@68
|
1249 float minEntropy, float minProb, byte minQual, boolean allowZeroSizeSketch){
|
jpayne@68
|
1250 assert(mode==ONE_SKETCH || mode==PER_FILE);
|
jpayne@68
|
1251 final boolean compressed=ffin1.compressed();
|
jpayne@68
|
1252
|
jpayne@68
|
1253 maxThreads=Tools.mid(1, maxThreads, Shared.threads());
|
jpayne@68
|
1254
|
jpayne@68
|
1255 //Create a read input stream
|
jpayne@68
|
1256 final ConcurrentReadInputStream cris;
|
jpayne@68
|
1257 String simpleName;
|
jpayne@68
|
1258 {
|
jpayne@68
|
1259 simpleName=ffin1.simpleName();
|
jpayne@68
|
1260 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
|
jpayne@68
|
1261 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
|
jpayne@68
|
1262 cris.start(); //Start the stream
|
jpayne@68
|
1263 // if(verbose){outstream.println("Started cris");}
|
jpayne@68
|
1264 }
|
jpayne@68
|
1265
|
jpayne@68
|
1266 //TODO: bgzip actually decompresses fast.
|
jpayne@68
|
1267 final int threads=(int)Tools.min(maxThreads,
|
jpayne@68
|
1268 1.75f*(compressed ? 1 : 2)*(ffin2==null ? 4 : 8)*(mergePairs ? 3 : minEntropy>0 ? 2 : 1));
|
jpayne@68
|
1269
|
jpayne@68
|
1270 if(verbose2){System.err.println("Starting "+threads+" load threads.");}
|
jpayne@68
|
1271 ArrayList<LoadThread2> list=new ArrayList<LoadThread2>(threads);
|
jpayne@68
|
1272 for(int i=0; i<threads; i++){
|
jpayne@68
|
1273 list.add(new LoadThread2(cris, minEntropy, minProb, minQual));
|
jpayne@68
|
1274 list.get(i).start();
|
jpayne@68
|
1275 }
|
jpayne@68
|
1276
|
jpayne@68
|
1277 ArrayList<SketchHeap> heaps=new ArrayList<SketchHeap>(threads);
|
jpayne@68
|
1278
|
jpayne@68
|
1279 for(LoadThread2 pt : list){
|
jpayne@68
|
1280
|
jpayne@68
|
1281 //Wait until this thread has terminated
|
jpayne@68
|
1282 while(pt.getState()!=Thread.State.TERMINATED){
|
jpayne@68
|
1283 try {
|
jpayne@68
|
1284 //Attempt a join operation
|
jpayne@68
|
1285 pt.join();
|
jpayne@68
|
1286 } catch (InterruptedException e) {
|
jpayne@68
|
1287 //Potentially handle this, if it is expected to occur
|
jpayne@68
|
1288 e.printStackTrace();
|
jpayne@68
|
1289 }
|
jpayne@68
|
1290 }
|
jpayne@68
|
1291
|
jpayne@68
|
1292 if(pt.smm.heap!=null && pt.smm.heap.size()>0){
|
jpayne@68
|
1293 heaps.add(pt.smm.heap);
|
jpayne@68
|
1294 }
|
jpayne@68
|
1295 }
|
jpayne@68
|
1296 list.clear();
|
jpayne@68
|
1297 ReadWrite.closeStream(cris);
|
jpayne@68
|
1298
|
jpayne@68
|
1299 if(verbose2){System.err.println("Generating a sketch by combining thread output.");}
|
jpayne@68
|
1300 Sketch sketch=toSketch(heaps, allowZeroSizeSketch);
|
jpayne@68
|
1301 if(verbose2){System.err.println("Resulting sketch: "+((sketch==null) ? "null" : "length="+sketch.length()));}
|
jpayne@68
|
1302 if(sketch!=null){sketch.setFname(simpleName);}
|
jpayne@68
|
1303 return sketch;
|
jpayne@68
|
1304 }
|
jpayne@68
|
1305
|
jpayne@68
|
1306 /*--------------------------------------------------------------*/
|
jpayne@68
|
1307 /*---------------- Writing ----------------*/
|
jpayne@68
|
1308 /*--------------------------------------------------------------*/
|
jpayne@68
|
1309
|
jpayne@68
|
1310 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff[]){
|
jpayne@68
|
1311 final int len=ff.length;
|
jpayne@68
|
1312 ByteStreamWriter tsw[]=new ByteStreamWriter[len];
|
jpayne@68
|
1313 for(int i=0; i<len; i++){
|
jpayne@68
|
1314 tsw[i]=new ByteStreamWriter(ff[i]);
|
jpayne@68
|
1315 tsw[i].start();
|
jpayne@68
|
1316 }
|
jpayne@68
|
1317 boolean error=false;
|
jpayne@68
|
1318 for(int i=0; i<sketches.size(); i++){
|
jpayne@68
|
1319 write(sketches.get(i), tsw[i%len], new ByteBuilder());
|
jpayne@68
|
1320 }
|
jpayne@68
|
1321 for(int i=0; i<len; i++){
|
jpayne@68
|
1322 error|=tsw[i].poisonAndWait();
|
jpayne@68
|
1323 }
|
jpayne@68
|
1324 return error;
|
jpayne@68
|
1325 }
|
jpayne@68
|
1326
|
jpayne@68
|
1327 public static boolean write(ArrayList<Sketch> sketches, FileFormat ff){
|
jpayne@68
|
1328 final ByteStreamWriter tsw=new ByteStreamWriter(ff);
|
jpayne@68
|
1329 final ByteBuilder bb=new ByteBuilder();
|
jpayne@68
|
1330 tsw.start();
|
jpayne@68
|
1331 for(Sketch sketch : sketches){
|
jpayne@68
|
1332 write(sketch, tsw, bb);
|
jpayne@68
|
1333 }
|
jpayne@68
|
1334 return tsw.poisonAndWait();
|
jpayne@68
|
1335 }
|
jpayne@68
|
1336
|
jpayne@68
|
1337 public static boolean write(Sketch sketch, FileFormat ff){
|
jpayne@68
|
1338 // System.err.println(ff.name()+", "+new File(ff.name()).exists());
|
jpayne@68
|
1339 ByteStreamWriter tsw=new ByteStreamWriter(ff);
|
jpayne@68
|
1340 // assert(false) : new File(ff.name()).exists();
|
jpayne@68
|
1341 tsw.start();
|
jpayne@68
|
1342 write(sketch, tsw, null);
|
jpayne@68
|
1343 return tsw.poisonAndWait();
|
jpayne@68
|
1344 }
|
jpayne@68
|
1345
|
jpayne@68
|
1346 public static void write(Sketch sketch, ByteStreamWriter tsw, ByteBuilder bb){
|
jpayne@68
|
1347 if(bb==null){bb=new ByteBuilder();}
|
jpayne@68
|
1348 else{bb.clear();}
|
jpayne@68
|
1349 sketch.toBytes(bb);
|
jpayne@68
|
1350 tsw.print(bb);
|
jpayne@68
|
1351 }
|
jpayne@68
|
1352
|
jpayne@68
|
1353 /*--------------------------------------------------------------*/
|
jpayne@68
|
1354 /*---------------- Fields ----------------*/
|
jpayne@68
|
1355 /*--------------------------------------------------------------*/
|
jpayne@68
|
1356
|
jpayne@68
|
1357 // final EntropyTracker eTracker;
|
jpayne@68
|
1358 final int stTargetSketchSize;
|
jpayne@68
|
1359 public final int minKeyOccuranceCount;
|
jpayne@68
|
1360 /** Force kmer counts to be tracked. */
|
jpayne@68
|
1361 public final boolean trackCounts;
|
jpayne@68
|
1362 /** Merge reads before processing kmers. */
|
jpayne@68
|
1363 public final boolean mergePairs;
|
jpayne@68
|
1364
|
jpayne@68
|
1365 public static int BUFLEN=16384;
|
jpayne@68
|
1366 public static boolean BUFFERED_READER=false;
|
jpayne@68
|
1367
|
jpayne@68
|
1368 /*--------------------------------------------------------------*/
|
jpayne@68
|
1369 /*---------------- Static Fields ----------------*/
|
jpayne@68
|
1370 /*--------------------------------------------------------------*/
|
jpayne@68
|
1371
|
jpayne@68
|
1372 // public static boolean verbose=false;
|
jpayne@68
|
1373
|
jpayne@68
|
1374 }
|