annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/kmer/TableLoaderLockFree.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package kmer;
jpayne@68 2
jpayne@68 3 import java.io.PrintStream;
jpayne@68 4 import java.util.ArrayList;
jpayne@68 5 import java.util.Arrays;
jpayne@68 6 import java.util.concurrent.ArrayBlockingQueue;
jpayne@68 7
jpayne@68 8 import dna.AminoAcid;
jpayne@68 9 import fileIO.FileFormat;
jpayne@68 10 import fileIO.ReadWrite;
jpayne@68 11 import fileIO.TextStreamWriter;
jpayne@68 12 import jgi.BBMerge;
jpayne@68 13 import shared.PreParser;
jpayne@68 14 import shared.Shared;
jpayne@68 15 import shared.Timer;
jpayne@68 16 import shared.Tools;
jpayne@68 17 import stream.ConcurrentReadInputStream;
jpayne@68 18 import stream.Read;
jpayne@68 19 import structures.IntList;
jpayne@68 20 import structures.ListNum;
jpayne@68 21
jpayne@68 22 /**
jpayne@68 23 * @author Brian Bushnell
jpayne@68 24 * @date Mar 4, 2015
jpayne@68 25 *
jpayne@68 26 */
jpayne@68 27 public class TableLoaderLockFree {
jpayne@68 28
jpayne@68 29 /*--------------------------------------------------------------*/
jpayne@68 30 /*---------------- Initialization ----------------*/
jpayne@68 31 /*--------------------------------------------------------------*/
jpayne@68 32
jpayne@68 33 /**
jpayne@68 34 * Code entrance from the command line.
jpayne@68 35 * @param args Command line arguments
jpayne@68 36 */
jpayne@68 37 public static void main(String[] args){
jpayne@68 38
jpayne@68 39 {//Preparse block for help, config files, and outstream
jpayne@68 40 PreParser pp=new PreParser(args, null, false);
jpayne@68 41 args=pp.args;
jpayne@68 42 outstream=pp.outstream;
jpayne@68 43 }
jpayne@68 44
jpayne@68 45 Timer t=new Timer();
jpayne@68 46
jpayne@68 47 AbstractKmerTable[] tables=makeTables(AbstractKmerTable.ARRAY1D, 12, -1L, false, 1.0);
jpayne@68 48
jpayne@68 49 int k=31;
jpayne@68 50 int mink=0;
jpayne@68 51 int speed=0;
jpayne@68 52 int hdist=0;
jpayne@68 53 int edist=0;
jpayne@68 54 boolean rcomp=true;
jpayne@68 55 boolean maskMiddle=false;
jpayne@68 56
jpayne@68 57 //Create a new Loader instance
jpayne@68 58 TableLoaderLockFree loader=new TableLoaderLockFree(tables, k, mink, speed, hdist, edist, rcomp, maskMiddle);
jpayne@68 59 loader.setRefSkip(0);
jpayne@68 60 loader.hammingDistance2=0;
jpayne@68 61 loader.editDistance2=0;
jpayne@68 62 loader.storeMode(SET_IF_NOT_PRESENT);
jpayne@68 63
jpayne@68 64 ///And run it
jpayne@68 65 String[] refs=args;
jpayne@68 66 String[] literals=null;
jpayne@68 67 boolean keepNames=false;
jpayne@68 68 boolean useRefNames=false;
jpayne@68 69 long kmers=loader.processData(refs, literals, keepNames, useRefNames, false);
jpayne@68 70 t.stop();
jpayne@68 71
jpayne@68 72 outstream.println("Time: \t"+t);
jpayne@68 73 outstream.println("Return: \t"+kmers);
jpayne@68 74 outstream.println("refKmers: \t"+loader.refKmers);
jpayne@68 75 outstream.println("refBases: \t"+loader.refBases);
jpayne@68 76 outstream.println("refReads: \t"+loader.refReads);
jpayne@68 77
jpayne@68 78 //Close the print stream if it was redirected
jpayne@68 79 Shared.closeStream(outstream);
jpayne@68 80 }
jpayne@68 81
jpayne@68 82 public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_){
jpayne@68 83 this(tables_, k_, 0, 0, 0, 0, true, false);
jpayne@68 84 }
jpayne@68 85
jpayne@68 86 public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_, int mink_, int speed_, int hdist_, int edist_, boolean rcomp_, boolean maskMiddle_){
jpayne@68 87 tables=tables_;
jpayne@68 88 k=k_;
jpayne@68 89 k2=k-1;
jpayne@68 90 mink=mink_;
jpayne@68 91 rcomp=rcomp_;
jpayne@68 92 useShortKmers=(mink>0 && mink<k);
jpayne@68 93 speed=speed_;
jpayne@68 94 hammingDistance=hdist_;
jpayne@68 95 editDistance=edist_;
jpayne@68 96 middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L;
jpayne@68 97 }
jpayne@68 98
jpayne@68 99
jpayne@68 100 /*--------------------------------------------------------------*/
jpayne@68 101 /*---------------- Outer Methods ----------------*/
jpayne@68 102 /*--------------------------------------------------------------*/
jpayne@68 103
jpayne@68 104
jpayne@68 105 // public static AbstractKmerTable[] makeTables(int tableType, int initialSize, long coreMask, boolean growable){
jpayne@68 106 // return AbstractKmerTable.preallocate(WAYS, tableType, initialSize, coreMask, growable);
jpayne@68 107 // }
jpayne@68 108
jpayne@68 109 // public static AbstractKmerTable[] makeTables(int tableType, int[] schedule, long coreMask){
jpayne@68 110 // return AbstractKmerTable.preallocate(WAYS, tableType, schedule, coreMask);
jpayne@68 111 // }
jpayne@68 112
jpayne@68 113 public static AbstractKmerTable[] makeTables(int tableType, int bytesPerKmer, long coreMask,
jpayne@68 114 boolean prealloc, double memRatio){
jpayne@68 115 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, bytesPerKmer, prealloc, memRatio);
jpayne@68 116 int[] schedule=scheduleMaker.makeSchedule();
jpayne@68 117 return AbstractKmerTable.preallocate(WAYS, tableType, schedule, coreMask);
jpayne@68 118 }
jpayne@68 119
jpayne@68 120 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, false, 0.8);
jpayne@68 121 int[] schedule=scheduleMaker.makeSchedule();
jpayne@68 122
jpayne@68 123 public long processData(String[] ref, String[] literal, boolean keepNames, boolean useRefNames, boolean ecc_){
jpayne@68 124
jpayne@68 125 scaffoldNames=null;
jpayne@68 126 refNames=null;
jpayne@68 127 refScafCounts=null;
jpayne@68 128 scaffoldLengths=null;
jpayne@68 129 ecc=ecc_;
jpayne@68 130
jpayne@68 131 if(keepNames){
jpayne@68 132 scaffoldNames=new ArrayList<String>();
jpayne@68 133 refNames=new ArrayList<String>();
jpayne@68 134 scaffoldLengths=new IntList();
jpayne@68 135
jpayne@68 136 if(ref!=null){
jpayne@68 137 for(String s : ref){refNames.add(s);}
jpayne@68 138 }
jpayne@68 139 if(literal!=null){refNames.add("literal");}
jpayne@68 140 refScafCounts=new int[refNames.size()];
jpayne@68 141
jpayne@68 142 if(useRefNames){toRefNames();}
jpayne@68 143 }
jpayne@68 144
jpayne@68 145 return spawnLoadThreads(ref, literal);
jpayne@68 146 }
jpayne@68 147
jpayne@68 148 public void setRefSkip(int x){setRefSkip(x, x);}
jpayne@68 149
jpayne@68 150 public void setRefSkip(int min, int max){
jpayne@68 151 max=Tools.max(min, max);
jpayne@68 152 if(min==max){
jpayne@68 153 minRefSkip=maxRefSkip=min;
jpayne@68 154 }else{
jpayne@68 155 minRefSkip=min;
jpayne@68 156 maxRefSkip=max;
jpayne@68 157 }
jpayne@68 158 variableRefSkip=(minRefSkip!=maxRefSkip);
jpayne@68 159 }
jpayne@68 160
jpayne@68 161 public void storeMode(final int x){
jpayne@68 162 assert(x==SET_IF_NOT_PRESENT || x==SET_ALWAYS || x==INCREMENT);
jpayne@68 163 storeMode=x;
jpayne@68 164 }
jpayne@68 165
jpayne@68 166 /*--------------------------------------------------------------*/
jpayne@68 167 /*---------------- Inner Methods ----------------*/
jpayne@68 168 /*--------------------------------------------------------------*/
jpayne@68 169
jpayne@68 170
jpayne@68 171 /**
jpayne@68 172 * Fills tables with kmers from references, using multiple LoadThread.
jpayne@68 173 * @return Number of kmers stored.
jpayne@68 174 */
jpayne@68 175 private long spawnLoadThreads(String[] ref, String[] literal){
jpayne@68 176 Timer t=new Timer();
jpayne@68 177 if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
jpayne@68 178 long added=0;
jpayne@68 179
jpayne@68 180 /* Create load threads */
jpayne@68 181 LoadThread[] loaders=new LoadThread[WAYS];
jpayne@68 182 for(int i=0; i<loaders.length; i++){
jpayne@68 183 loaders[i]=new LoadThread(i);
jpayne@68 184 loaders[i].start();
jpayne@68 185 }
jpayne@68 186
jpayne@68 187 /* For each reference file... */
jpayne@68 188 int refNum=0;
jpayne@68 189 if(ref!=null){
jpayne@68 190 for(String refname : ref){
jpayne@68 191
jpayne@68 192 /* Start an input stream */
jpayne@68 193 FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true);
jpayne@68 194 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
jpayne@68 195 cris.start(); //4567
jpayne@68 196 ListNum<Read> ln=cris.nextList();
jpayne@68 197 ArrayList<Read> reads=(ln!=null ? ln.list : null);
jpayne@68 198
jpayne@68 199 /* Iterate through read lists from the input stream */
jpayne@68 200 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 201 {
jpayne@68 202 /* Assign a unique ID number to each scaffold */
jpayne@68 203 ArrayList<Read> reads2=new ArrayList<Read>(reads);
jpayne@68 204 if(scaffoldNames!=null){
jpayne@68 205 for(Read r1 : reads2){
jpayne@68 206 final Read r2=r1.mate;
jpayne@68 207 final Integer id=scaffoldNames.size();
jpayne@68 208 if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
jpayne@68 209 refScafCounts[refNum]++;
jpayne@68 210 scaffoldNames.add(r1.id==null ? id.toString() : r1.id);
jpayne@68 211 int len=r1.length();
jpayne@68 212 r1.obj=id;
jpayne@68 213 if(r2!=null){
jpayne@68 214 r2.obj=id;
jpayne@68 215 len+=r2.length();
jpayne@68 216 }
jpayne@68 217 scaffoldLengths.add(len);
jpayne@68 218 }
jpayne@68 219 }
jpayne@68 220
jpayne@68 221 if(REPLICATE_AMBIGUOUS){
jpayne@68 222 reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink));
jpayne@68 223 }
jpayne@68 224
jpayne@68 225 /* Send a pointer to the read list to each LoadThread */
jpayne@68 226 for(LoadThread lt : loaders){
jpayne@68 227 boolean b=true;
jpayne@68 228 while(b){
jpayne@68 229 try {
jpayne@68 230 lt.queue.put(reads2);
jpayne@68 231 b=false;
jpayne@68 232 } catch (InterruptedException e) {
jpayne@68 233 //TODO: This will hang due to still-running threads.
jpayne@68 234 throw new RuntimeException(e);
jpayne@68 235 }
jpayne@68 236 }
jpayne@68 237 }
jpayne@68 238 }
jpayne@68 239
jpayne@68 240 /* Dispose of the old list and fetch a new one */
jpayne@68 241 cris.returnList(ln);
jpayne@68 242 ln=cris.nextList();
jpayne@68 243 reads=(ln!=null ? ln.list : null);
jpayne@68 244 }
jpayne@68 245 /* Cleanup */
jpayne@68 246 cris.returnList(ln);
jpayne@68 247 errorState|=ReadWrite.closeStream(cris);
jpayne@68 248 refNum++;
jpayne@68 249 }
jpayne@68 250 }
jpayne@68 251
jpayne@68 252 /* If there are literal sequences to use as references */
jpayne@68 253 if(literal!=null){
jpayne@68 254 ArrayList<Read> list=new ArrayList<Read>(literal.length);
jpayne@68 255 if(verbose){outstream.println("Adding literals "+Arrays.toString(literal));}
jpayne@68 256
jpayne@68 257 /* Assign a unique ID number to each literal sequence */
jpayne@68 258 for(int i=0; i<literal.length; i++){
jpayne@68 259 if(scaffoldNames!=null){
jpayne@68 260 final Integer id=scaffoldNames.size();
jpayne@68 261 final Read r=new Read(literal[i].getBytes(), null, id);
jpayne@68 262 refScafCounts[refNum]++;
jpayne@68 263 scaffoldNames.add(id.toString());
jpayne@68 264 scaffoldLengths.add(r.length());
jpayne@68 265 r.obj=id;
jpayne@68 266 }else{
jpayne@68 267 final Read r=new Read(literal[i].getBytes(), null, i);
jpayne@68 268 list.add(r);
jpayne@68 269 }
jpayne@68 270 }
jpayne@68 271
jpayne@68 272 if(REPLICATE_AMBIGUOUS){
jpayne@68 273 list=Tools.replicateAmbiguous(list, Tools.min(k, mink));
jpayne@68 274 }
jpayne@68 275
jpayne@68 276 /* Send a pointer to the read list to each LoadThread */
jpayne@68 277 for(LoadThread lt : loaders){
jpayne@68 278 boolean b=true;
jpayne@68 279 while(b){
jpayne@68 280 try {
jpayne@68 281 lt.queue.put(list);
jpayne@68 282 b=false;
jpayne@68 283 } catch (InterruptedException e) {
jpayne@68 284 //TODO: This will hang due to still-running threads.
jpayne@68 285 throw new RuntimeException(e);
jpayne@68 286 }
jpayne@68 287 }
jpayne@68 288 }
jpayne@68 289 }
jpayne@68 290
jpayne@68 291 /* Signal loaders to terminate */
jpayne@68 292 for(LoadThread lt : loaders){
jpayne@68 293 boolean b=true;
jpayne@68 294 while(b){
jpayne@68 295 try {
jpayne@68 296 lt.queue.put(POISON);
jpayne@68 297 b=false;
jpayne@68 298 } catch (InterruptedException e) {
jpayne@68 299 //TODO: This will hang due to still-running threads.
jpayne@68 300 throw new RuntimeException(e);
jpayne@68 301 }
jpayne@68 302 }
jpayne@68 303 }
jpayne@68 304
jpayne@68 305 /* Wait for loaders to die, and gather statistics */
jpayne@68 306 for(LoadThread lt : loaders){
jpayne@68 307 while(lt.getState()!=Thread.State.TERMINATED){
jpayne@68 308 try {
jpayne@68 309 lt.join();
jpayne@68 310 } catch (InterruptedException e) {
jpayne@68 311 // TODO Auto-generated catch block
jpayne@68 312 e.printStackTrace();
jpayne@68 313 }
jpayne@68 314 }
jpayne@68 315 added+=lt.addedT;
jpayne@68 316 refKmers+=lt.refKmersT;
jpayne@68 317 refBases+=lt.refBasesT;
jpayne@68 318 refReads+=lt.refReadsT;
jpayne@68 319 }
jpayne@68 320 //Correct statistics for number of threads, since each thread processes all reference data
jpayne@68 321 refKmers/=WAYS;
jpayne@68 322 refBases/=WAYS;
jpayne@68 323 refReads/=WAYS;
jpayne@68 324
jpayne@68 325 t.stop();
jpayne@68 326 if(DISPLAY_PROGRESS){
jpayne@68 327 outstream.println("Added "+added+" kmers; time: \t"+t);
jpayne@68 328 Shared.printMemory();
jpayne@68 329 outstream.println();
jpayne@68 330 }
jpayne@68 331
jpayne@68 332 if(verbose){
jpayne@68 333 TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
jpayne@68 334 tsw.start();
jpayne@68 335 for(AbstractKmerTable table : tables){
jpayne@68 336 table.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);
jpayne@68 337 }
jpayne@68 338 tsw.poisonAndWait();
jpayne@68 339 }
jpayne@68 340
jpayne@68 341 return added;
jpayne@68 342 }
jpayne@68 343
jpayne@68 344
jpayne@68 345
jpayne@68 346 /**
jpayne@68 347 * Fills the scaffold names array with reference names.
jpayne@68 348 */
jpayne@68 349 public void toRefNames(){
jpayne@68 350 final int numRefs=refNames.size();
jpayne@68 351 for(int r=0, s=1; r<numRefs; r++){
jpayne@68 352 final int scafs=refScafCounts[r];
jpayne@68 353 final int lim=s+scafs;
jpayne@68 354 final String name=ReadWrite.stripToCore(refNames.get(r));
jpayne@68 355 // outstream.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name);
jpayne@68 356 while(s<lim){
jpayne@68 357 // outstream.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name);
jpayne@68 358 scaffoldNames.set(s, name);
jpayne@68 359 s++;
jpayne@68 360 }
jpayne@68 361 }
jpayne@68 362 }
jpayne@68 363
jpayne@68 364 /*--------------------------------------------------------------*/
jpayne@68 365 /*---------------- Inner Classes ----------------*/
jpayne@68 366 /*--------------------------------------------------------------*/
jpayne@68 367
jpayne@68 368 /**
jpayne@68 369 * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum.
jpayne@68 370 */
jpayne@68 371 private class LoadThread extends Thread{
jpayne@68 372
jpayne@68 373 public LoadThread(final int tnum_){
jpayne@68 374 tnum=tnum_;
jpayne@68 375 map=tables[tnum];
jpayne@68 376 }
jpayne@68 377
jpayne@68 378 /**
jpayne@68 379 * Get the next list of reads (or scaffolds) from the queue.
jpayne@68 380 * @return List of reads
jpayne@68 381 */
jpayne@68 382 private ArrayList<Read> fetch(){
jpayne@68 383 ArrayList<Read> list=null;
jpayne@68 384 while(list==null){
jpayne@68 385 try {
jpayne@68 386 list=queue.take();
jpayne@68 387 } catch (InterruptedException e) {
jpayne@68 388 // TODO Auto-generated catch block
jpayne@68 389 e.printStackTrace();
jpayne@68 390 }
jpayne@68 391 }
jpayne@68 392 return list;
jpayne@68 393 }
jpayne@68 394
jpayne@68 395 @Override
jpayne@68 396 public void run(){
jpayne@68 397 ArrayList<Read> reads=fetch();
jpayne@68 398 while(reads!=POISON){
jpayne@68 399 for(Read r1 : reads){
jpayne@68 400 assert(r1.pairnum()==0);
jpayne@68 401 final Read r2=r1.mate;
jpayne@68 402
jpayne@68 403 addedT+=addToMap(r1, minRefSkip);
jpayne@68 404 if(r2!=null){addedT+=addToMap(r2, minRefSkip);}
jpayne@68 405 }
jpayne@68 406 reads=fetch();
jpayne@68 407 }
jpayne@68 408
jpayne@68 409 if(map.canRebalance() && map.size()>2L*map.arrayLength()){
jpayne@68 410 map.rebalance();
jpayne@68 411 }
jpayne@68 412 }
jpayne@68 413
jpayne@68 414 /**
jpayne@68 415 * @param r The current read to process
jpayne@68 416 * @param skip Number of bases to skip between kmers
jpayne@68 417 * @return Number of kmers stored
jpayne@68 418 */
jpayne@68 419 private long addToMap(Read r, int skip){
jpayne@68 420 if(variableRefSkip){
jpayne@68 421 int rblen=r.length();
jpayne@68 422 skip=(rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0);
jpayne@68 423 skip=Tools.mid(minRefSkip, maxRefSkip, skip);
jpayne@68 424 }
jpayne@68 425 final byte[] bases=r.bases;
jpayne@68 426 final int shift=2*k;
jpayne@68 427 final int shift2=shift-2;
jpayne@68 428 final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
jpayne@68 429 final long kmask=kMasks[k];
jpayne@68 430 long kmer=0;
jpayne@68 431 long rkmer=0;
jpayne@68 432 long added=0;
jpayne@68 433 int len=0;
jpayne@68 434
jpayne@68 435 if(bases!=null){
jpayne@68 436 refReadsT++;
jpayne@68 437 refBasesT+=bases.length;
jpayne@68 438 }
jpayne@68 439 if(bases==null || bases.length<k){return 0;}
jpayne@68 440
jpayne@68 441 final int id=(r.obj==null ? 1 : ((Integer)r.obj).intValue());
jpayne@68 442
jpayne@68 443 if(skip>1){ //Process while skipping some kmers
jpayne@68 444 for(int i=0; i<bases.length; i++){
jpayne@68 445 byte b=bases[i];
jpayne@68 446 long x=AminoAcid.baseToNumber[b];
jpayne@68 447 long x2=AminoAcid.baseToComplementNumber[b];
jpayne@68 448 kmer=((kmer<<2)|x)&mask;
jpayne@68 449 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
jpayne@68 450 if(x<0){len=0; rkmer=0;}else{len++;}
jpayne@68 451 if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
jpayne@68 452 if(len>=k){
jpayne@68 453 refKmersT++;
jpayne@68 454 if(len%skip==0){
jpayne@68 455 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
jpayne@68 456 added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
jpayne@68 457 if(useShortKmers){
jpayne@68 458 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
jpayne@68 459 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
jpayne@68 460 }
jpayne@68 461 }
jpayne@68 462 }
jpayne@68 463 }
jpayne@68 464 }else{ //Process all kmers
jpayne@68 465 for(int i=0; i<bases.length; i++){
jpayne@68 466 byte b=bases[i];
jpayne@68 467 long x=AminoAcid.baseToNumber[b];
jpayne@68 468 long x2=AminoAcid.baseToComplementNumber[b];
jpayne@68 469 kmer=((kmer<<2)|x)&mask;
jpayne@68 470 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
jpayne@68 471 if(x<0){len=0; rkmer=0;}else{len++;}
jpayne@68 472 if(verbose){outstream.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
jpayne@68 473 if(len>=k){
jpayne@68 474 refKmersT++;
jpayne@68 475 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
jpayne@68 476 final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
jpayne@68 477 added+=atm;
jpayne@68 478 // assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask));
jpayne@68 479 if(useShortKmers){
jpayne@68 480 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
jpayne@68 481 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
jpayne@68 482 }
jpayne@68 483 }
jpayne@68 484 }
jpayne@68 485 }
jpayne@68 486 return added;
jpayne@68 487 }
jpayne@68 488
jpayne@68 489
jpayne@68 490 /**
jpayne@68 491 * Adds short kmers on the left end of the read.
jpayne@68 492 * @param kmer Forward kmer
jpayne@68 493 * @param rkmer Reverse kmer
jpayne@68 494 * @param extraBase Base added to end in case of deletions
jpayne@68 495 * @param id Scaffold number
jpayne@68 496 * @return Number of kmers stored
jpayne@68 497 */
jpayne@68 498 private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){
jpayne@68 499 if(verbose){outstream.println("addToMapLeftShift");}
jpayne@68 500 long added=0;
jpayne@68 501 for(int i=k-1; i>=mink; i--){
jpayne@68 502 kmer=kmer&rightMasks[i];
jpayne@68 503 rkmer=rkmer>>>2;
jpayne@68 504 long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2);
jpayne@68 505 added+=x;
jpayne@68 506 if(verbose){
jpayne@68 507 if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){
jpayne@68 508 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
jpayne@68 509 outstream.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i));
jpayne@68 510 final long value=toValue(kmer, rkmer, kMasks[i]);
jpayne@68 511 if(map.contains(value)){outstream.println("Found "+value);}
jpayne@68 512 }
jpayne@68 513 }
jpayne@68 514 }
jpayne@68 515 return added;
jpayne@68 516 }
jpayne@68 517
jpayne@68 518
jpayne@68 519 /**
jpayne@68 520 * Adds short kmers on the right end of the read.
jpayne@68 521 * @param kmer Forward kmer
jpayne@68 522 * @param rkmer Reverse kmer
jpayne@68 523 * @param id Scaffold number
jpayne@68 524 * @return Number of kmers stored
jpayne@68 525 */
jpayne@68 526 private long addToMapRightShift(long kmer, long rkmer, final int id){
jpayne@68 527 if(verbose){outstream.println("addToMapRightShift");}
jpayne@68 528 long added=0;
jpayne@68 529 for(int i=k-1; i>=mink; i--){
jpayne@68 530 long extraBase=kmer&3L;
jpayne@68 531 kmer=kmer>>>2;
jpayne@68 532 rkmer=rkmer&rightMasks[i];
jpayne@68 533 // assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i];
jpayne@68 534 // assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i];
jpayne@68 535 long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2);
jpayne@68 536 added+=x;
jpayne@68 537 if(verbose){
jpayne@68 538 if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){
jpayne@68 539 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
jpayne@68 540 outstream.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i));
jpayne@68 541 final long value=toValue(kmer, rkmer, kMasks[i]);
jpayne@68 542 if(map.contains(value)){outstream.println("Found "+value);}
jpayne@68 543 }
jpayne@68 544 }
jpayne@68 545 }
jpayne@68 546 return added;
jpayne@68 547 }
jpayne@68 548
jpayne@68 549
jpayne@68 550 /**
jpayne@68 551 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance.
jpayne@68 552 * @param kmer Forward kmer
jpayne@68 553 * @param rkmer Reverse kmer
jpayne@68 554 * @param len Kmer length
jpayne@68 555 * @param extraBase Base added to end in case of deletions
jpayne@68 556 * @param id Scaffold number
jpayne@68 557 * @param kmask0
jpayne@68 558 * @return Number of kmers stored
jpayne@68 559 */
jpayne@68 560 private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){
jpayne@68 561
jpayne@68 562 assert(kmask0==kMasks[len]) : kmask0+", "+len+", "+kMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(kMasks[len]);
jpayne@68 563
jpayne@68 564 if(verbose){outstream.println("addToMap_A; len="+len+"; kMasks[len]="+kMasks[len]);}
jpayne@68 565 assert((kmer&kmask0)==0);
jpayne@68 566 final long added;
jpayne@68 567 if(hdist==0){
jpayne@68 568 final long key=toValue(kmer, rkmer, kmask0);
jpayne@68 569 if(failsSpeed(key)){return 0;}
jpayne@68 570 if(key%WAYS!=tnum){return 0;}
jpayne@68 571 if(verbose){outstream.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);}
jpayne@68 572 if(storeMode==SET_IF_NOT_PRESENT){
jpayne@68 573 added=map.setIfNotPresent(key, id);
jpayne@68 574 }else if(storeMode==SET_ALWAYS){
jpayne@68 575 added=map.set(key, id);
jpayne@68 576 }else{
jpayne@68 577 assert(storeMode==INCREMENT);
jpayne@68 578 added=map.increment(key, 1);
jpayne@68 579 }
jpayne@68 580 }else if(edist>0){
jpayne@68 581 // long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
jpayne@68 582 added=mutate(kmer, rkmer, len, id, edist, extraBase);
jpayne@68 583 }else{
jpayne@68 584 added=mutate(kmer, rkmer, len, id, hdist, -1);
jpayne@68 585 }
jpayne@68 586 if(verbose){outstream.println("addToMap added "+added+" keys.");}
jpayne@68 587 return added;
jpayne@68 588 }
jpayne@68 589
jpayne@68 590 /**
jpayne@68 591 * Mutate and store this kmer through 'dist' recursions.
jpayne@68 592 * @param kmer Forward kmer
jpayne@68 593 * @param rkmer Reverse kmer
jpayne@68 594 * @param id Scaffold number
jpayne@68 595 * @param dist Number of mutations
jpayne@68 596 * @param extraBase Base added to end in case of deletions
jpayne@68 597 * @return Number of kmers stored
jpayne@68 598 */
jpayne@68 599 private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
jpayne@68 600 long added=0;
jpayne@68 601
jpayne@68 602 final long key=toValue(kmer, rkmer, kMasks[len]);
jpayne@68 603
jpayne@68 604 if(verbose){outstream.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+kMasks[len]);}
jpayne@68 605 if(key%WAYS==tnum){
jpayne@68 606 if(verbose){outstream.println("mutate_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);}
jpayne@68 607 int x;
jpayne@68 608 if(storeMode==SET_IF_NOT_PRESENT){
jpayne@68 609 x=map.setIfNotPresent(key, id);
jpayne@68 610 }else if(storeMode==SET_ALWAYS){
jpayne@68 611 x=map.set(key, id);
jpayne@68 612 }else{
jpayne@68 613 assert(storeMode==INCREMENT);
jpayne@68 614 x=map.increment(key, 1);
jpayne@68 615 x=(x==1 ? 1 : 0);
jpayne@68 616 }
jpayne@68 617 if(verbose){outstream.println("mutate_B added "+x+" keys.");}
jpayne@68 618 added+=x;
jpayne@68 619 assert(map.contains(key));
jpayne@68 620 }
jpayne@68 621
jpayne@68 622 if(dist>0){
jpayne@68 623 final int dist2=dist-1;
jpayne@68 624
jpayne@68 625 //Sub
jpayne@68 626 for(int j=0; j<4; j++){
jpayne@68 627 for(int i=0; i<len; i++){
jpayne@68 628 final long temp=(kmer&clearMasks[i])|setMasks[j][i];
jpayne@68 629 if(temp!=kmer){
jpayne@68 630 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
jpayne@68 631 added+=mutate(temp, rtemp, len, id, dist2, extraBase);
jpayne@68 632 }
jpayne@68 633 }
jpayne@68 634 }
jpayne@68 635
jpayne@68 636 if(editDistance>0){
jpayne@68 637 //Del
jpayne@68 638 if(extraBase>=0 && extraBase<=3){
jpayne@68 639 for(int i=1; i<len; i++){
jpayne@68 640 final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase;
jpayne@68 641 if(temp!=kmer){
jpayne@68 642 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
jpayne@68 643 added+=mutate(temp, rtemp, len, id, dist2, -1);
jpayne@68 644 }
jpayne@68 645 }
jpayne@68 646 }
jpayne@68 647
jpayne@68 648 //Ins
jpayne@68 649 final long eb2=kmer&3;
jpayne@68 650 for(int i=1; i<len; i++){
jpayne@68 651 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2);
jpayne@68 652 for(int j=0; j<4; j++){
jpayne@68 653 final long temp=temp0|setMasks[j][i-1];
jpayne@68 654 if(temp!=kmer){
jpayne@68 655 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
jpayne@68 656 added+=mutate(temp, rtemp, len, id, dist2, eb2);
jpayne@68 657 }
jpayne@68 658 }
jpayne@68 659 }
jpayne@68 660 }
jpayne@68 661
jpayne@68 662 }
jpayne@68 663
jpayne@68 664 return added;
jpayne@68 665 }
jpayne@68 666
jpayne@68 667 /*--------------------------------------------------------------*/
jpayne@68 668
jpayne@68 669 /** Number of kmers stored by this thread */
jpayne@68 670 public long addedT=0;
jpayne@68 671 /** Number of items encountered by this thread */
jpayne@68 672 public long refKmersT=0, refReadsT=0, refBasesT=0;
jpayne@68 673 /** Thread number; used to determine which kmers to store */
jpayne@68 674 public final int tnum;
jpayne@68 675 /** Buffer of input read lists */
jpayne@68 676 public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
jpayne@68 677
jpayne@68 678 /** Destination for storing kmers */
jpayne@68 679 private final AbstractKmerTable map;
jpayne@68 680
jpayne@68 681 }
jpayne@68 682
jpayne@68 683 /*--------------------------------------------------------------*/
jpayne@68 684 /*---------------- Static Methods ----------------*/
jpayne@68 685 /*--------------------------------------------------------------*/
jpayne@68 686
jpayne@68 687 /**
jpayne@68 688 * Transforms a kmer into a canonical value stored in the table. Expected to be inlined.
jpayne@68 689 * @param kmer Forward kmer
jpayne@68 690 * @param rkmer Reverse kmer
jpayne@68 691 * @param lengthMask Bitmask with single '1' set to left of kmer
jpayne@68 692 * @return Canonical value
jpayne@68 693 */
jpayne@68 694 private final long toValue(long kmer, long rkmer, long lengthMask){
jpayne@68 695 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
jpayne@68 696 long value=(rcomp ? Tools.max(kmer, rkmer) : kmer);
jpayne@68 697 return (value&middleMask)|lengthMask;
jpayne@68 698 }
jpayne@68 699
jpayne@68 700 final boolean passesSpeed(long key){
jpayne@68 701 return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed;
jpayne@68 702 }
jpayne@68 703
jpayne@68 704 final boolean failsSpeed(long key){
jpayne@68 705 return speed>0 && ((key&Long.MAX_VALUE)%17)<speed;
jpayne@68 706 }
jpayne@68 707
jpayne@68 708 /*--------------------------------------------------------------*/
jpayne@68 709 /*---------------- Fields ----------------*/
jpayne@68 710 /*--------------------------------------------------------------*/
jpayne@68 711
jpayne@68 712 /** Has this class encountered errors while processing? */
jpayne@68 713 public boolean errorState=false;
jpayne@68 714
jpayne@68 715 /** How to associate values with kmers */
jpayne@68 716 private int storeMode=SET_IF_NOT_PRESENT;
jpayne@68 717
jpayne@68 718 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
jpayne@68 719 public AbstractKmerTable[] tables;
jpayne@68 720 /** A scaffold's name is stored at scaffoldNames.get(id).
jpayne@68 721 * scaffoldNames[0] is reserved, so the first id is 1. */
jpayne@68 722 public ArrayList<String> scaffoldNames;
jpayne@68 723 /** Names of reference files (refNames[0] is valid). */
jpayne@68 724 public ArrayList<String> refNames;
jpayne@68 725 /** Number of scaffolds per reference. */
jpayne@68 726 public int[] refScafCounts;
jpayne@68 727 /** scaffoldLengths[id] stores the length of that scaffold */
jpayne@68 728 public IntList scaffoldLengths=new IntList();
jpayne@68 729
jpayne@68 730 /** Make the middle base in a kmer a wildcard to improve sensitivity */
jpayne@68 731 public final boolean maskMiddle=false;
jpayne@68 732 /** Correct errors via read overlap */
jpayne@68 733 public boolean ecc=false;
jpayne@68 734
jpayne@68 735 /** Store reference kmers with up to this many substitutions */
jpayne@68 736 public final int hammingDistance;
jpayne@68 737 /** Store reference kmers with up to this many edits (including indels) */
jpayne@68 738 public final int editDistance;
jpayne@68 739 /** Store short reference kmers with up to this many substitutions */
jpayne@68 740 public int hammingDistance2=-1;
jpayne@68 741 /** Store short reference kmers with up to this many edits (including indels) */
jpayne@68 742 public int editDistance2=-1;
jpayne@68 743 /** Always skip at least this many consecutive kmers when hashing reference.
jpayne@68 744 * 1 means every kmer is used, 2 means every other, etc. */
jpayne@68 745 private int minRefSkip=0;
jpayne@68 746 /** Never skip more than this many consecutive kmers when hashing reference. */
jpayne@68 747 private int maxRefSkip=0;
jpayne@68 748
jpayne@68 749 private boolean variableRefSkip=false;
jpayne@68 750
jpayne@68 751 /*--------------------------------------------------------------*/
jpayne@68 752 /*---------------- Statistics ----------------*/
jpayne@68 753 /*--------------------------------------------------------------*/
jpayne@68 754
jpayne@68 755 long refReads=0;
jpayne@68 756 long refBases=0;
jpayne@68 757 long refKmers=0;
jpayne@68 758
jpayne@68 759 long storedKmers=0;
jpayne@68 760
jpayne@68 761 /*--------------------------------------------------------------*/
jpayne@68 762 /*---------------- Final Primitives ----------------*/
jpayne@68 763 /*--------------------------------------------------------------*/
jpayne@68 764
jpayne@68 765 /** Look for reverse-complements as well as forward kmers. Default: true */
jpayne@68 766 private final boolean rcomp;
jpayne@68 767 /** AND bitmask with 0's at the middle base */
jpayne@68 768 private final long middleMask;
jpayne@68 769
jpayne@68 770 /** Normal kmer length */
jpayne@68 771 private final int k;
jpayne@68 772 /** k-1; used in some expressions */
jpayne@68 773 private final int k2;
jpayne@68 774 /** Shortest kmer to use for trimming */
jpayne@68 775 private final int mink;
jpayne@68 776 /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */
jpayne@68 777 private final boolean useShortKmers;
jpayne@68 778
jpayne@68 779 /** Fraction of kmers to skip, 0 to 16 out of 17 */
jpayne@68 780 private final int speed;
jpayne@68 781
jpayne@68 782 /*--------------------------------------------------------------*/
jpayne@68 783 /*---------------- Static Fields ----------------*/
jpayne@68 784 /*--------------------------------------------------------------*/
jpayne@68 785
jpayne@68 786 /** Default initial size of data structures */
jpayne@68 787 private static final int initialSizeDefault=128000;
jpayne@68 788
jpayne@68 789 /** Number of tables (and threads, during loading) */
jpayne@68 790 private static final int WAYS=7; //123
jpayne@68 791 /** Verbose messages */
jpayne@68 792 public static final boolean verbose=false; //123
jpayne@68 793
jpayne@68 794 /** Print messages to this stream */
jpayne@68 795 private static PrintStream outstream=System.err;
jpayne@68 796 /** Display progress messages such as memory usage */
jpayne@68 797 public static boolean DISPLAY_PROGRESS=true;
jpayne@68 798 /** Indicates end of input stream */
jpayne@68 799 private static final ArrayList<Read> POISON=new ArrayList<Read>(0);
jpayne@68 800 /** Make unambiguous copies of ref sequences with ambiguous bases */
jpayne@68 801 public static boolean REPLICATE_AMBIGUOUS=false;
jpayne@68 802
jpayne@68 803 /** x&clearMasks[i] will clear base i */
jpayne@68 804 private static final long[] clearMasks;
jpayne@68 805 /** x|setMasks[i][j] will set base i to j */
jpayne@68 806 private static final long[][] setMasks;
jpayne@68 807 /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
jpayne@68 808 private static final long[] leftMasks;
jpayne@68 809 /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
jpayne@68 810 private static final long[] rightMasks;
jpayne@68 811 /** x|kMasks[i] will set the bit to the left of the leftmost base */
jpayne@68 812 private static final long[] kMasks;
jpayne@68 813
jpayne@68 814 public static final int SET_IF_NOT_PRESENT=1, SET_ALWAYS=2, INCREMENT=3;
jpayne@68 815
jpayne@68 816 /*--------------------------------------------------------------*/
jpayne@68 817 /*---------------- Static Initializers ----------------*/
jpayne@68 818 /*--------------------------------------------------------------*/
jpayne@68 819
jpayne@68 820 static{
jpayne@68 821 clearMasks=new long[32];
jpayne@68 822 leftMasks=new long[32];
jpayne@68 823 rightMasks=new long[32];
jpayne@68 824 kMasks=new long[32];
jpayne@68 825 setMasks=new long[4][32];
jpayne@68 826 for(int i=0; i<32; i++){
jpayne@68 827 clearMasks[i]=~(3L<<(2*i));
jpayne@68 828 }
jpayne@68 829 for(int i=0; i<32; i++){
jpayne@68 830 leftMasks[i]=((-1L)<<(2*i));
jpayne@68 831 }
jpayne@68 832 for(int i=0; i<32; i++){
jpayne@68 833 rightMasks[i]=~((-1L)<<(2*i));
jpayne@68 834 }
jpayne@68 835 for(int i=0; i<32; i++){
jpayne@68 836 kMasks[i]=((1L)<<(2*i));
jpayne@68 837 }
jpayne@68 838 for(int i=0; i<32; i++){
jpayne@68 839 for(long j=0; j<4; j++){
jpayne@68 840 setMasks[(int)j][i]=(j<<(2*i));
jpayne@68 841 }
jpayne@68 842 }
jpayne@68 843 }
jpayne@68 844
jpayne@68 845 }