Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/TableLoaderLockFree.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package kmer; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 import java.util.concurrent.ArrayBlockingQueue; | |
7 | |
8 import dna.AminoAcid; | |
9 import fileIO.FileFormat; | |
10 import fileIO.ReadWrite; | |
11 import fileIO.TextStreamWriter; | |
12 import jgi.BBMerge; | |
13 import shared.PreParser; | |
14 import shared.Shared; | |
15 import shared.Timer; | |
16 import shared.Tools; | |
17 import stream.ConcurrentReadInputStream; | |
18 import stream.Read; | |
19 import structures.IntList; | |
20 import structures.ListNum; | |
21 | |
22 /** | |
23 * @author Brian Bushnell | |
24 * @date Mar 4, 2015 | |
25 * | |
26 */ | |
27 public class TableLoaderLockFree { | |
28 | |
29 /*--------------------------------------------------------------*/ | |
30 /*---------------- Initialization ----------------*/ | |
31 /*--------------------------------------------------------------*/ | |
32 | |
33 /** | |
34 * Code entrance from the command line. | |
35 * @param args Command line arguments | |
36 */ | |
37 public static void main(String[] args){ | |
38 | |
39 {//Preparse block for help, config files, and outstream | |
40 PreParser pp=new PreParser(args, null, false); | |
41 args=pp.args; | |
42 outstream=pp.outstream; | |
43 } | |
44 | |
45 Timer t=new Timer(); | |
46 | |
47 AbstractKmerTable[] tables=makeTables(AbstractKmerTable.ARRAY1D, 12, -1L, false, 1.0); | |
48 | |
49 int k=31; | |
50 int mink=0; | |
51 int speed=0; | |
52 int hdist=0; | |
53 int edist=0; | |
54 boolean rcomp=true; | |
55 boolean maskMiddle=false; | |
56 | |
57 //Create a new Loader instance | |
58 TableLoaderLockFree loader=new TableLoaderLockFree(tables, k, mink, speed, hdist, edist, rcomp, maskMiddle); | |
59 loader.setRefSkip(0); | |
60 loader.hammingDistance2=0; | |
61 loader.editDistance2=0; | |
62 loader.storeMode(SET_IF_NOT_PRESENT); | |
63 | |
64 ///And run it | |
65 String[] refs=args; | |
66 String[] literals=null; | |
67 boolean keepNames=false; | |
68 boolean useRefNames=false; | |
69 long kmers=loader.processData(refs, literals, keepNames, useRefNames, false); | |
70 t.stop(); | |
71 | |
72 outstream.println("Time: \t"+t); | |
73 outstream.println("Return: \t"+kmers); | |
74 outstream.println("refKmers: \t"+loader.refKmers); | |
75 outstream.println("refBases: \t"+loader.refBases); | |
76 outstream.println("refReads: \t"+loader.refReads); | |
77 | |
78 //Close the print stream if it was redirected | |
79 Shared.closeStream(outstream); | |
80 } | |
81 | |
82 public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_){ | |
83 this(tables_, k_, 0, 0, 0, 0, true, false); | |
84 } | |
85 | |
86 public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_, int mink_, int speed_, int hdist_, int edist_, boolean rcomp_, boolean maskMiddle_){ | |
87 tables=tables_; | |
88 k=k_; | |
89 k2=k-1; | |
90 mink=mink_; | |
91 rcomp=rcomp_; | |
92 useShortKmers=(mink>0 && mink<k); | |
93 speed=speed_; | |
94 hammingDistance=hdist_; | |
95 editDistance=edist_; | |
96 middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L; | |
97 } | |
98 | |
99 | |
100 /*--------------------------------------------------------------*/ | |
101 /*---------------- Outer Methods ----------------*/ | |
102 /*--------------------------------------------------------------*/ | |
103 | |
104 | |
105 // public static AbstractKmerTable[] makeTables(int tableType, int initialSize, long coreMask, boolean growable){ | |
106 // return AbstractKmerTable.preallocate(WAYS, tableType, initialSize, coreMask, growable); | |
107 // } | |
108 | |
109 // public static AbstractKmerTable[] makeTables(int tableType, int[] schedule, long coreMask){ | |
110 // return AbstractKmerTable.preallocate(WAYS, tableType, schedule, coreMask); | |
111 // } | |
112 | |
113 public static AbstractKmerTable[] makeTables(int tableType, int bytesPerKmer, long coreMask, | |
114 boolean prealloc, double memRatio){ | |
115 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, bytesPerKmer, prealloc, memRatio); | |
116 int[] schedule=scheduleMaker.makeSchedule(); | |
117 return AbstractKmerTable.preallocate(WAYS, tableType, schedule, coreMask); | |
118 } | |
119 | |
120 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, false, 0.8); | |
121 int[] schedule=scheduleMaker.makeSchedule(); | |
122 | |
123 public long processData(String[] ref, String[] literal, boolean keepNames, boolean useRefNames, boolean ecc_){ | |
124 | |
125 scaffoldNames=null; | |
126 refNames=null; | |
127 refScafCounts=null; | |
128 scaffoldLengths=null; | |
129 ecc=ecc_; | |
130 | |
131 if(keepNames){ | |
132 scaffoldNames=new ArrayList<String>(); | |
133 refNames=new ArrayList<String>(); | |
134 scaffoldLengths=new IntList(); | |
135 | |
136 if(ref!=null){ | |
137 for(String s : ref){refNames.add(s);} | |
138 } | |
139 if(literal!=null){refNames.add("literal");} | |
140 refScafCounts=new int[refNames.size()]; | |
141 | |
142 if(useRefNames){toRefNames();} | |
143 } | |
144 | |
145 return spawnLoadThreads(ref, literal); | |
146 } | |
147 | |
148 public void setRefSkip(int x){setRefSkip(x, x);} | |
149 | |
150 public void setRefSkip(int min, int max){ | |
151 max=Tools.max(min, max); | |
152 if(min==max){ | |
153 minRefSkip=maxRefSkip=min; | |
154 }else{ | |
155 minRefSkip=min; | |
156 maxRefSkip=max; | |
157 } | |
158 variableRefSkip=(minRefSkip!=maxRefSkip); | |
159 } | |
160 | |
161 public void storeMode(final int x){ | |
162 assert(x==SET_IF_NOT_PRESENT || x==SET_ALWAYS || x==INCREMENT); | |
163 storeMode=x; | |
164 } | |
165 | |
166 /*--------------------------------------------------------------*/ | |
167 /*---------------- Inner Methods ----------------*/ | |
168 /*--------------------------------------------------------------*/ | |
169 | |
170 | |
171 /** | |
172 * Fills tables with kmers from references, using multiple LoadThread. | |
173 * @return Number of kmers stored. | |
174 */ | |
175 private long spawnLoadThreads(String[] ref, String[] literal){ | |
176 Timer t=new Timer(); | |
177 if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;} | |
178 long added=0; | |
179 | |
180 /* Create load threads */ | |
181 LoadThread[] loaders=new LoadThread[WAYS]; | |
182 for(int i=0; i<loaders.length; i++){ | |
183 loaders[i]=new LoadThread(i); | |
184 loaders[i].start(); | |
185 } | |
186 | |
187 /* For each reference file... */ | |
188 int refNum=0; | |
189 if(ref!=null){ | |
190 for(String refname : ref){ | |
191 | |
192 /* Start an input stream */ | |
193 FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true); | |
194 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true); | |
195 cris.start(); //4567 | |
196 ListNum<Read> ln=cris.nextList(); | |
197 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
198 | |
199 /* Iterate through read lists from the input stream */ | |
200 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
201 { | |
202 /* Assign a unique ID number to each scaffold */ | |
203 ArrayList<Read> reads2=new ArrayList<Read>(reads); | |
204 if(scaffoldNames!=null){ | |
205 for(Read r1 : reads2){ | |
206 final Read r2=r1.mate; | |
207 final Integer id=scaffoldNames.size(); | |
208 if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);} | |
209 refScafCounts[refNum]++; | |
210 scaffoldNames.add(r1.id==null ? id.toString() : r1.id); | |
211 int len=r1.length(); | |
212 r1.obj=id; | |
213 if(r2!=null){ | |
214 r2.obj=id; | |
215 len+=r2.length(); | |
216 } | |
217 scaffoldLengths.add(len); | |
218 } | |
219 } | |
220 | |
221 if(REPLICATE_AMBIGUOUS){ | |
222 reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink)); | |
223 } | |
224 | |
225 /* Send a pointer to the read list to each LoadThread */ | |
226 for(LoadThread lt : loaders){ | |
227 boolean b=true; | |
228 while(b){ | |
229 try { | |
230 lt.queue.put(reads2); | |
231 b=false; | |
232 } catch (InterruptedException e) { | |
233 //TODO: This will hang due to still-running threads. | |
234 throw new RuntimeException(e); | |
235 } | |
236 } | |
237 } | |
238 } | |
239 | |
240 /* Dispose of the old list and fetch a new one */ | |
241 cris.returnList(ln); | |
242 ln=cris.nextList(); | |
243 reads=(ln!=null ? ln.list : null); | |
244 } | |
245 /* Cleanup */ | |
246 cris.returnList(ln); | |
247 errorState|=ReadWrite.closeStream(cris); | |
248 refNum++; | |
249 } | |
250 } | |
251 | |
252 /* If there are literal sequences to use as references */ | |
253 if(literal!=null){ | |
254 ArrayList<Read> list=new ArrayList<Read>(literal.length); | |
255 if(verbose){outstream.println("Adding literals "+Arrays.toString(literal));} | |
256 | |
257 /* Assign a unique ID number to each literal sequence */ | |
258 for(int i=0; i<literal.length; i++){ | |
259 if(scaffoldNames!=null){ | |
260 final Integer id=scaffoldNames.size(); | |
261 final Read r=new Read(literal[i].getBytes(), null, id); | |
262 refScafCounts[refNum]++; | |
263 scaffoldNames.add(id.toString()); | |
264 scaffoldLengths.add(r.length()); | |
265 r.obj=id; | |
266 }else{ | |
267 final Read r=new Read(literal[i].getBytes(), null, i); | |
268 list.add(r); | |
269 } | |
270 } | |
271 | |
272 if(REPLICATE_AMBIGUOUS){ | |
273 list=Tools.replicateAmbiguous(list, Tools.min(k, mink)); | |
274 } | |
275 | |
276 /* Send a pointer to the read list to each LoadThread */ | |
277 for(LoadThread lt : loaders){ | |
278 boolean b=true; | |
279 while(b){ | |
280 try { | |
281 lt.queue.put(list); | |
282 b=false; | |
283 } catch (InterruptedException e) { | |
284 //TODO: This will hang due to still-running threads. | |
285 throw new RuntimeException(e); | |
286 } | |
287 } | |
288 } | |
289 } | |
290 | |
291 /* Signal loaders to terminate */ | |
292 for(LoadThread lt : loaders){ | |
293 boolean b=true; | |
294 while(b){ | |
295 try { | |
296 lt.queue.put(POISON); | |
297 b=false; | |
298 } catch (InterruptedException e) { | |
299 //TODO: This will hang due to still-running threads. | |
300 throw new RuntimeException(e); | |
301 } | |
302 } | |
303 } | |
304 | |
305 /* Wait for loaders to die, and gather statistics */ | |
306 for(LoadThread lt : loaders){ | |
307 while(lt.getState()!=Thread.State.TERMINATED){ | |
308 try { | |
309 lt.join(); | |
310 } catch (InterruptedException e) { | |
311 // TODO Auto-generated catch block | |
312 e.printStackTrace(); | |
313 } | |
314 } | |
315 added+=lt.addedT; | |
316 refKmers+=lt.refKmersT; | |
317 refBases+=lt.refBasesT; | |
318 refReads+=lt.refReadsT; | |
319 } | |
320 //Correct statistics for number of threads, since each thread processes all reference data | |
321 refKmers/=WAYS; | |
322 refBases/=WAYS; | |
323 refReads/=WAYS; | |
324 | |
325 t.stop(); | |
326 if(DISPLAY_PROGRESS){ | |
327 outstream.println("Added "+added+" kmers; time: \t"+t); | |
328 Shared.printMemory(); | |
329 outstream.println(); | |
330 } | |
331 | |
332 if(verbose){ | |
333 TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT); | |
334 tsw.start(); | |
335 for(AbstractKmerTable table : tables){ | |
336 table.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE); | |
337 } | |
338 tsw.poisonAndWait(); | |
339 } | |
340 | |
341 return added; | |
342 } | |
343 | |
344 | |
345 | |
346 /** | |
347 * Fills the scaffold names array with reference names. | |
348 */ | |
349 public void toRefNames(){ | |
350 final int numRefs=refNames.size(); | |
351 for(int r=0, s=1; r<numRefs; r++){ | |
352 final int scafs=refScafCounts[r]; | |
353 final int lim=s+scafs; | |
354 final String name=ReadWrite.stripToCore(refNames.get(r)); | |
355 // outstream.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name); | |
356 while(s<lim){ | |
357 // outstream.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name); | |
358 scaffoldNames.set(s, name); | |
359 s++; | |
360 } | |
361 } | |
362 } | |
363 | |
364 /*--------------------------------------------------------------*/ | |
365 /*---------------- Inner Classes ----------------*/ | |
366 /*--------------------------------------------------------------*/ | |
367 | |
368 /** | |
369 * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum. | |
370 */ | |
371 private class LoadThread extends Thread{ | |
372 | |
373 public LoadThread(final int tnum_){ | |
374 tnum=tnum_; | |
375 map=tables[tnum]; | |
376 } | |
377 | |
378 /** | |
379 * Get the next list of reads (or scaffolds) from the queue. | |
380 * @return List of reads | |
381 */ | |
382 private ArrayList<Read> fetch(){ | |
383 ArrayList<Read> list=null; | |
384 while(list==null){ | |
385 try { | |
386 list=queue.take(); | |
387 } catch (InterruptedException e) { | |
388 // TODO Auto-generated catch block | |
389 e.printStackTrace(); | |
390 } | |
391 } | |
392 return list; | |
393 } | |
394 | |
395 @Override | |
396 public void run(){ | |
397 ArrayList<Read> reads=fetch(); | |
398 while(reads!=POISON){ | |
399 for(Read r1 : reads){ | |
400 assert(r1.pairnum()==0); | |
401 final Read r2=r1.mate; | |
402 | |
403 addedT+=addToMap(r1, minRefSkip); | |
404 if(r2!=null){addedT+=addToMap(r2, minRefSkip);} | |
405 } | |
406 reads=fetch(); | |
407 } | |
408 | |
409 if(map.canRebalance() && map.size()>2L*map.arrayLength()){ | |
410 map.rebalance(); | |
411 } | |
412 } | |
413 | |
414 /** | |
415 * @param r The current read to process | |
416 * @param skip Number of bases to skip between kmers | |
417 * @return Number of kmers stored | |
418 */ | |
419 private long addToMap(Read r, int skip){ | |
420 if(variableRefSkip){ | |
421 int rblen=r.length(); | |
422 skip=(rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0); | |
423 skip=Tools.mid(minRefSkip, maxRefSkip, skip); | |
424 } | |
425 final byte[] bases=r.bases; | |
426 final int shift=2*k; | |
427 final int shift2=shift-2; | |
428 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); | |
429 final long kmask=kMasks[k]; | |
430 long kmer=0; | |
431 long rkmer=0; | |
432 long added=0; | |
433 int len=0; | |
434 | |
435 if(bases!=null){ | |
436 refReadsT++; | |
437 refBasesT+=bases.length; | |
438 } | |
439 if(bases==null || bases.length<k){return 0;} | |
440 | |
441 final int id=(r.obj==null ? 1 : ((Integer)r.obj).intValue()); | |
442 | |
443 if(skip>1){ //Process while skipping some kmers | |
444 for(int i=0; i<bases.length; i++){ | |
445 byte b=bases[i]; | |
446 long x=AminoAcid.baseToNumber[b]; | |
447 long x2=AminoAcid.baseToComplementNumber[b]; | |
448 kmer=((kmer<<2)|x)&mask; | |
449 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; | |
450 if(x<0){len=0; rkmer=0;}else{len++;} | |
451 if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} | |
452 if(len>=k){ | |
453 refKmersT++; | |
454 if(len%skip==0){ | |
455 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); | |
456 added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); | |
457 if(useShortKmers){ | |
458 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} | |
459 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} | |
460 } | |
461 } | |
462 } | |
463 } | |
464 }else{ //Process all kmers | |
465 for(int i=0; i<bases.length; i++){ | |
466 byte b=bases[i]; | |
467 long x=AminoAcid.baseToNumber[b]; | |
468 long x2=AminoAcid.baseToComplementNumber[b]; | |
469 kmer=((kmer<<2)|x)&mask; | |
470 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; | |
471 if(x<0){len=0; rkmer=0;}else{len++;} | |
472 if(verbose){outstream.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} | |
473 if(len>=k){ | |
474 refKmersT++; | |
475 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); | |
476 final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); | |
477 added+=atm; | |
478 // assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask)); | |
479 if(useShortKmers){ | |
480 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} | |
481 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} | |
482 } | |
483 } | |
484 } | |
485 } | |
486 return added; | |
487 } | |
488 | |
489 | |
490 /** | |
491 * Adds short kmers on the left end of the read. | |
492 * @param kmer Forward kmer | |
493 * @param rkmer Reverse kmer | |
494 * @param extraBase Base added to end in case of deletions | |
495 * @param id Scaffold number | |
496 * @return Number of kmers stored | |
497 */ | |
498 private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){ | |
499 if(verbose){outstream.println("addToMapLeftShift");} | |
500 long added=0; | |
501 for(int i=k-1; i>=mink; i--){ | |
502 kmer=kmer&rightMasks[i]; | |
503 rkmer=rkmer>>>2; | |
504 long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2); | |
505 added+=x; | |
506 if(verbose){ | |
507 if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){ | |
508 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); | |
509 outstream.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)); | |
510 final long value=toValue(kmer, rkmer, kMasks[i]); | |
511 if(map.contains(value)){outstream.println("Found "+value);} | |
512 } | |
513 } | |
514 } | |
515 return added; | |
516 } | |
517 | |
518 | |
519 /** | |
520 * Adds short kmers on the right end of the read. | |
521 * @param kmer Forward kmer | |
522 * @param rkmer Reverse kmer | |
523 * @param id Scaffold number | |
524 * @return Number of kmers stored | |
525 */ | |
526 private long addToMapRightShift(long kmer, long rkmer, final int id){ | |
527 if(verbose){outstream.println("addToMapRightShift");} | |
528 long added=0; | |
529 for(int i=k-1; i>=mink; i--){ | |
530 long extraBase=kmer&3L; | |
531 kmer=kmer>>>2; | |
532 rkmer=rkmer&rightMasks[i]; | |
533 // assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i]; | |
534 // assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i]; | |
535 long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2); | |
536 added+=x; | |
537 if(verbose){ | |
538 if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){ | |
539 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); | |
540 outstream.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)); | |
541 final long value=toValue(kmer, rkmer, kMasks[i]); | |
542 if(map.contains(value)){outstream.println("Found "+value);} | |
543 } | |
544 } | |
545 } | |
546 return added; | |
547 } | |
548 | |
549 | |
550 /** | |
551 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance. | |
552 * @param kmer Forward kmer | |
553 * @param rkmer Reverse kmer | |
554 * @param len Kmer length | |
555 * @param extraBase Base added to end in case of deletions | |
556 * @param id Scaffold number | |
557 * @param kmask0 | |
558 * @return Number of kmers stored | |
559 */ | |
560 private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){ | |
561 | |
562 assert(kmask0==kMasks[len]) : kmask0+", "+len+", "+kMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(kMasks[len]); | |
563 | |
564 if(verbose){outstream.println("addToMap_A; len="+len+"; kMasks[len]="+kMasks[len]);} | |
565 assert((kmer&kmask0)==0); | |
566 final long added; | |
567 if(hdist==0){ | |
568 final long key=toValue(kmer, rkmer, kmask0); | |
569 if(failsSpeed(key)){return 0;} | |
570 if(key%WAYS!=tnum){return 0;} | |
571 if(verbose){outstream.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);} | |
572 if(storeMode==SET_IF_NOT_PRESENT){ | |
573 added=map.setIfNotPresent(key, id); | |
574 }else if(storeMode==SET_ALWAYS){ | |
575 added=map.set(key, id); | |
576 }else{ | |
577 assert(storeMode==INCREMENT); | |
578 added=map.increment(key, 1); | |
579 } | |
580 }else if(edist>0){ | |
581 // long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); | |
582 added=mutate(kmer, rkmer, len, id, edist, extraBase); | |
583 }else{ | |
584 added=mutate(kmer, rkmer, len, id, hdist, -1); | |
585 } | |
586 if(verbose){outstream.println("addToMap added "+added+" keys.");} | |
587 return added; | |
588 } | |
589 | |
590 /** | |
591 * Mutate and store this kmer through 'dist' recursions. | |
592 * @param kmer Forward kmer | |
593 * @param rkmer Reverse kmer | |
594 * @param id Scaffold number | |
595 * @param dist Number of mutations | |
596 * @param extraBase Base added to end in case of deletions | |
597 * @return Number of kmers stored | |
598 */ | |
599 private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){ | |
600 long added=0; | |
601 | |
602 final long key=toValue(kmer, rkmer, kMasks[len]); | |
603 | |
604 if(verbose){outstream.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+kMasks[len]);} | |
605 if(key%WAYS==tnum){ | |
606 if(verbose){outstream.println("mutate_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);} | |
607 int x; | |
608 if(storeMode==SET_IF_NOT_PRESENT){ | |
609 x=map.setIfNotPresent(key, id); | |
610 }else if(storeMode==SET_ALWAYS){ | |
611 x=map.set(key, id); | |
612 }else{ | |
613 assert(storeMode==INCREMENT); | |
614 x=map.increment(key, 1); | |
615 x=(x==1 ? 1 : 0); | |
616 } | |
617 if(verbose){outstream.println("mutate_B added "+x+" keys.");} | |
618 added+=x; | |
619 assert(map.contains(key)); | |
620 } | |
621 | |
622 if(dist>0){ | |
623 final int dist2=dist-1; | |
624 | |
625 //Sub | |
626 for(int j=0; j<4; j++){ | |
627 for(int i=0; i<len; i++){ | |
628 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; | |
629 if(temp!=kmer){ | |
630 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); | |
631 added+=mutate(temp, rtemp, len, id, dist2, extraBase); | |
632 } | |
633 } | |
634 } | |
635 | |
636 if(editDistance>0){ | |
637 //Del | |
638 if(extraBase>=0 && extraBase<=3){ | |
639 for(int i=1; i<len; i++){ | |
640 final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase; | |
641 if(temp!=kmer){ | |
642 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); | |
643 added+=mutate(temp, rtemp, len, id, dist2, -1); | |
644 } | |
645 } | |
646 } | |
647 | |
648 //Ins | |
649 final long eb2=kmer&3; | |
650 for(int i=1; i<len; i++){ | |
651 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2); | |
652 for(int j=0; j<4; j++){ | |
653 final long temp=temp0|setMasks[j][i-1]; | |
654 if(temp!=kmer){ | |
655 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); | |
656 added+=mutate(temp, rtemp, len, id, dist2, eb2); | |
657 } | |
658 } | |
659 } | |
660 } | |
661 | |
662 } | |
663 | |
664 return added; | |
665 } | |
666 | |
667 /*--------------------------------------------------------------*/ | |
668 | |
669 /** Number of kmers stored by this thread */ | |
670 public long addedT=0; | |
671 /** Number of items encountered by this thread */ | |
672 public long refKmersT=0, refReadsT=0, refBasesT=0; | |
673 /** Thread number; used to determine which kmers to store */ | |
674 public final int tnum; | |
675 /** Buffer of input read lists */ | |
676 public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32); | |
677 | |
678 /** Destination for storing kmers */ | |
679 private final AbstractKmerTable map; | |
680 | |
681 } | |
682 | |
683 /*--------------------------------------------------------------*/ | |
684 /*---------------- Static Methods ----------------*/ | |
685 /*--------------------------------------------------------------*/ | |
686 | |
687 /** | |
688 * Transforms a kmer into a canonical value stored in the table. Expected to be inlined. | |
689 * @param kmer Forward kmer | |
690 * @param rkmer Reverse kmer | |
691 * @param lengthMask Bitmask with single '1' set to left of kmer | |
692 * @return Canonical value | |
693 */ | |
694 private final long toValue(long kmer, long rkmer, long lengthMask){ | |
695 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer; | |
696 long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); | |
697 return (value&middleMask)|lengthMask; | |
698 } | |
699 | |
700 final boolean passesSpeed(long key){ | |
701 return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed; | |
702 } | |
703 | |
704 final boolean failsSpeed(long key){ | |
705 return speed>0 && ((key&Long.MAX_VALUE)%17)<speed; | |
706 } | |
707 | |
708 /*--------------------------------------------------------------*/ | |
709 /*---------------- Fields ----------------*/ | |
710 /*--------------------------------------------------------------*/ | |
711 | |
712 /** Has this class encountered errors while processing? */ | |
713 public boolean errorState=false; | |
714 | |
715 /** How to associate values with kmers */ | |
716 private int storeMode=SET_IF_NOT_PRESENT; | |
717 | |
718 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ | |
719 public AbstractKmerTable[] tables; | |
720 /** A scaffold's name is stored at scaffoldNames.get(id). | |
721 * scaffoldNames[0] is reserved, so the first id is 1. */ | |
722 public ArrayList<String> scaffoldNames; | |
723 /** Names of reference files (refNames[0] is valid). */ | |
724 public ArrayList<String> refNames; | |
725 /** Number of scaffolds per reference. */ | |
726 public int[] refScafCounts; | |
727 /** scaffoldLengths[id] stores the length of that scaffold */ | |
728 public IntList scaffoldLengths=new IntList(); | |
729 | |
730 /** Make the middle base in a kmer a wildcard to improve sensitivity */ | |
731 public final boolean maskMiddle=false; | |
732 /** Correct errors via read overlap */ | |
733 public boolean ecc=false; | |
734 | |
735 /** Store reference kmers with up to this many substitutions */ | |
736 public final int hammingDistance; | |
737 /** Store reference kmers with up to this many edits (including indels) */ | |
738 public final int editDistance; | |
739 /** Store short reference kmers with up to this many substitutions */ | |
740 public int hammingDistance2=-1; | |
741 /** Store short reference kmers with up to this many edits (including indels) */ | |
742 public int editDistance2=-1; | |
743 /** Always skip at least this many consecutive kmers when hashing reference. | |
744 * 1 means every kmer is used, 2 means every other, etc. */ | |
745 private int minRefSkip=0; | |
746 /** Never skip more than this many consecutive kmers when hashing reference. */ | |
747 private int maxRefSkip=0; | |
748 | |
749 private boolean variableRefSkip=false; | |
750 | |
751 /*--------------------------------------------------------------*/ | |
752 /*---------------- Statistics ----------------*/ | |
753 /*--------------------------------------------------------------*/ | |
754 | |
755 long refReads=0; | |
756 long refBases=0; | |
757 long refKmers=0; | |
758 | |
759 long storedKmers=0; | |
760 | |
761 /*--------------------------------------------------------------*/ | |
762 /*---------------- Final Primitives ----------------*/ | |
763 /*--------------------------------------------------------------*/ | |
764 | |
765 /** Look for reverse-complements as well as forward kmers. Default: true */ | |
766 private final boolean rcomp; | |
767 /** AND bitmask with 0's at the middle base */ | |
768 private final long middleMask; | |
769 | |
770 /** Normal kmer length */ | |
771 private final int k; | |
772 /** k-1; used in some expressions */ | |
773 private final int k2; | |
774 /** Shortest kmer to use for trimming */ | |
775 private final int mink; | |
776 /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */ | |
777 private final boolean useShortKmers; | |
778 | |
779 /** Fraction of kmers to skip, 0 to 16 out of 17 */ | |
780 private final int speed; | |
781 | |
782 /*--------------------------------------------------------------*/ | |
783 /*---------------- Static Fields ----------------*/ | |
784 /*--------------------------------------------------------------*/ | |
785 | |
786 /** Default initial size of data structures */ | |
787 private static final int initialSizeDefault=128000; | |
788 | |
789 /** Number of tables (and threads, during loading) */ | |
790 private static final int WAYS=7; //123 | |
791 /** Verbose messages */ | |
792 public static final boolean verbose=false; //123 | |
793 | |
794 /** Print messages to this stream */ | |
795 private static PrintStream outstream=System.err; | |
796 /** Display progress messages such as memory usage */ | |
797 public static boolean DISPLAY_PROGRESS=true; | |
798 /** Indicates end of input stream */ | |
799 private static final ArrayList<Read> POISON=new ArrayList<Read>(0); | |
800 /** Make unambiguous copies of ref sequences with ambiguous bases */ | |
801 public static boolean REPLICATE_AMBIGUOUS=false; | |
802 | |
803 /** x&clearMasks[i] will clear base i */ | |
804 private static final long[] clearMasks; | |
805 /** x|setMasks[i][j] will set base i to j */ | |
806 private static final long[][] setMasks; | |
807 /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */ | |
808 private static final long[] leftMasks; | |
809 /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */ | |
810 private static final long[] rightMasks; | |
811 /** x|kMasks[i] will set the bit to the left of the leftmost base */ | |
812 private static final long[] kMasks; | |
813 | |
814 public static final int SET_IF_NOT_PRESENT=1, SET_ALWAYS=2, INCREMENT=3; | |
815 | |
816 /*--------------------------------------------------------------*/ | |
817 /*---------------- Static Initializers ----------------*/ | |
818 /*--------------------------------------------------------------*/ | |
819 | |
820 static{ | |
821 clearMasks=new long[32]; | |
822 leftMasks=new long[32]; | |
823 rightMasks=new long[32]; | |
824 kMasks=new long[32]; | |
825 setMasks=new long[4][32]; | |
826 for(int i=0; i<32; i++){ | |
827 clearMasks[i]=~(3L<<(2*i)); | |
828 } | |
829 for(int i=0; i<32; i++){ | |
830 leftMasks[i]=((-1L)<<(2*i)); | |
831 } | |
832 for(int i=0; i<32; i++){ | |
833 rightMasks[i]=~((-1L)<<(2*i)); | |
834 } | |
835 for(int i=0; i<32; i++){ | |
836 kMasks[i]=((1L)<<(2*i)); | |
837 } | |
838 for(int i=0; i<32; i++){ | |
839 for(long j=0; j<4; j++){ | |
840 setMasks[(int)j][i]=(j<<(2*i)); | |
841 } | |
842 } | |
843 } | |
844 | |
845 } |