annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount6.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package bloom;
jpayne@68 2
jpayne@68 3 import java.util.ArrayList;
jpayne@68 4 import java.util.BitSet;
jpayne@68 5 import java.util.Locale;
jpayne@68 6
jpayne@68 7 import dna.AminoAcid;
jpayne@68 8 import fileIO.FileFormat;
jpayne@68 9 import shared.Timer;
jpayne@68 10 import stream.ConcurrentReadInputStream;
jpayne@68 11 import stream.FastaReadInputStream;
jpayne@68 12 import stream.Read;
jpayne@68 13 import structures.ListNum;
jpayne@68 14
jpayne@68 15 /**
jpayne@68 16 * @author Brian Bushnell
jpayne@68 17 * @date Jul 5, 2012
jpayne@68 18 *
jpayne@68 19 */
jpayne@68 20 public class KmerCount6 extends KmerCountAbstract {
jpayne@68 21
jpayne@68 22 public static void main(String[] args){
jpayne@68 23
jpayne@68 24 Timer t=new Timer();
jpayne@68 25
jpayne@68 26 String fname1=args[0];
jpayne@68 27 String fname2=(args.length>1 ? args[1] : null);
jpayne@68 28 int k=14;
jpayne@68 29 int cbits=16;
jpayne@68 30 int gap=0;
jpayne@68 31
jpayne@68 32 for(int i=2; i<args.length; i++){
jpayne@68 33 final String arg=args[i];
jpayne@68 34 final String[] split=arg.split("=");
jpayne@68 35 String a=split[0].toLowerCase();
jpayne@68 36 String b=split.length>1 ? split[1] : null;
jpayne@68 37
jpayne@68 38 if(a.equals("k") || a.equals("kmer")){
jpayne@68 39 k=Integer.parseInt(b);
jpayne@68 40 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
jpayne@68 41 cbits=Integer.parseInt(b);
jpayne@68 42 }else if(a.startsWith("gap")){
jpayne@68 43 gap=Integer.parseInt(b);
jpayne@68 44 }else{
jpayne@68 45 throw new RuntimeException("Unknown parameter "+args[i]);
jpayne@68 46 }
jpayne@68 47 }
jpayne@68 48
jpayne@68 49 KCountArray count=null;
jpayne@68 50
jpayne@68 51 if(fileIO.FileFormat.hasFastaExtension(fname1)){
jpayne@68 52 assert(!FastaReadInputStream.SPLIT_READS);
jpayne@68 53 FastaReadInputStream.MIN_READ_LEN=k;
jpayne@68 54 }
jpayne@68 55
jpayne@68 56 count=count(fname1, fname2, k, cbits, true, null);
jpayne@68 57
jpayne@68 58
jpayne@68 59 t.stop();
jpayne@68 60 System.out.println("Finished counting; time = "+t);
jpayne@68 61
jpayne@68 62 printStatistics(count);
jpayne@68 63
jpayne@68 64 }
jpayne@68 65
jpayne@68 66 public static void printStatistics(KCountArray count){
jpayne@68 67 long[] freq=count.transformToFrequency();
jpayne@68 68
jpayne@68 69 // System.out.println(count+"\n");
jpayne@68 70 // System.out.println(Arrays.toString(freq)+"\n");
jpayne@68 71
jpayne@68 72 long sum=sum(freq);
jpayne@68 73 System.out.println("Kmer fraction:");
jpayne@68 74 int lim1=8, lim2=16;
jpayne@68 75 for(int i=0; i<lim1; i++){
jpayne@68 76 String prefix=i+"";
jpayne@68 77 while(prefix.length()<8){prefix=prefix+" ";}
jpayne@68 78 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
jpayne@68 79 }
jpayne@68 80 while(lim1<=freq.length){
jpayne@68 81 int x=0;
jpayne@68 82 for(int i=lim1; i<lim2; i++){
jpayne@68 83 x+=freq[i];
jpayne@68 84 }
jpayne@68 85 String prefix=lim1+"-"+(lim2-1);
jpayne@68 86 if(lim2>=freq.length){prefix=lim1+"+";}
jpayne@68 87 while(prefix.length()<8){prefix=prefix+" ";}
jpayne@68 88 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
jpayne@68 89 lim1*=2;
jpayne@68 90 lim2=min(lim2*2, freq.length);
jpayne@68 91 }
jpayne@68 92
jpayne@68 93 long sum2=sum-freq[0];
jpayne@68 94 long x=freq[1];
jpayne@68 95 System.out.println();
jpayne@68 96 System.out.println("Keys Counted: \t \t"+keysCounted);
jpayne@68 97 System.out.println("Unique: \t \t"+sum2);
jpayne@68 98 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2)));
jpayne@68 99 System.out.println();
jpayne@68 100 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
jpayne@68 101 x=sum2-x;
jpayne@68 102 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
jpayne@68 103 }
jpayne@68 104
jpayne@68 105 public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){
jpayne@68 106 assert(k<32 && k>=1 && (count!=null || k<20));
jpayne@68 107 final int kbits=2*k;
jpayne@68 108 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
jpayne@68 109
jpayne@68 110 if(count==null){
jpayne@68 111 final long cells=1L<<kbits;
jpayne@68 112 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
jpayne@68 113 count=KCountArray.makeNew(cells, cbits);
jpayne@68 114 }
jpayne@68 115
jpayne@68 116 final ConcurrentReadInputStream cris;
jpayne@68 117 {
jpayne@68 118 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
jpayne@68 119 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
jpayne@68 120 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
jpayne@68 121 cris.start(); //4567
jpayne@68 122 }
jpayne@68 123
jpayne@68 124 assert(cris!=null) : reads1;
jpayne@68 125 System.err.println("Started cris");
jpayne@68 126 boolean paired=cris.paired();
jpayne@68 127 if(verbose){System.err.println("Paired: "+paired);}
jpayne@68 128
jpayne@68 129 count(cris, k, rcomp, count);
jpayne@68 130
jpayne@68 131 cris.close();
jpayne@68 132 if(verbose){System.err.println("Closed stream");}
jpayne@68 133 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
jpayne@68 134
jpayne@68 135
jpayne@68 136 return count;
jpayne@68 137 }
jpayne@68 138
jpayne@68 139
jpayne@68 140 public static void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray count){
jpayne@68 141 assert(k<32 && k>=1 && (count!=null || k<20));
jpayne@68 142
jpayne@68 143 assert(count!=null);
jpayne@68 144
jpayne@68 145 ListNum<Read> ln=cris.nextList();
jpayne@68 146 ArrayList<Read> reads=(ln!=null ? ln.list : null);
jpayne@68 147
jpayne@68 148
jpayne@68 149 if(true /*count.gap==0*/){
jpayne@68 150 final int kbits=2*k;
jpayne@68 151 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
jpayne@68 152
jpayne@68 153
jpayne@68 154 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 155 //System.err.println("reads.size()="+reads.size());
jpayne@68 156 for(Read r : reads){
jpayne@68 157 readsProcessed++;
jpayne@68 158
jpayne@68 159 addRead(r, count, k, mask, rcomp);
jpayne@68 160 if(r.mate!=null){
jpayne@68 161 addRead(r.mate, count, k, mask, rcomp);
jpayne@68 162 }
jpayne@68 163
jpayne@68 164 }
jpayne@68 165 //System.err.println("returning list");
jpayne@68 166 cris.returnList(ln);
jpayne@68 167 //System.err.println("fetching list");
jpayne@68 168 ln=cris.nextList();
jpayne@68 169 reads=(ln!=null ? ln.list : null);
jpayne@68 170 }
jpayne@68 171 }else{
jpayne@68 172 final int k1=(k+1)/2;
jpayne@68 173 final int k2=k/2;
jpayne@68 174 final int kbits1=2*k1;
jpayne@68 175 final int kbits2=2*k2;
jpayne@68 176 // final int gap=count.gap;
jpayne@68 177 int gap=0; assert(false);
jpayne@68 178 final long mask1=~((-1L)<<(kbits1));
jpayne@68 179 final long mask2=~((-1L)<<(kbits2));
jpayne@68 180 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 181 //System.err.println("reads.size()="+reads.size());
jpayne@68 182 for(Read r : reads){
jpayne@68 183 readsProcessed++;
jpayne@68 184
jpayne@68 185 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
jpayne@68 186 if(r.mate!=null){
jpayne@68 187 addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
jpayne@68 188 }
jpayne@68 189
jpayne@68 190 }
jpayne@68 191 //System.err.println("returning list");
jpayne@68 192 cris.returnList(ln);
jpayne@68 193 //System.err.println("fetching list");
jpayne@68 194 ln=cris.nextList();
jpayne@68 195 reads=(ln!=null ? ln.list : null);
jpayne@68 196 }
jpayne@68 197 }
jpayne@68 198
jpayne@68 199 if(verbose){System.err.println("Finished reading");}
jpayne@68 200 cris.returnList(ln);
jpayne@68 201 if(verbose){System.err.println("Returned list");}
jpayne@68 202 }
jpayne@68 203
jpayne@68 204
jpayne@68 205
jpayne@68 206
jpayne@68 207 public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
jpayne@68 208 KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){
jpayne@68 209
jpayne@68 210 assert(k<32 && k>=1 && (count!=null || k<20));
jpayne@68 211 final int kbits=2*k;
jpayne@68 212 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
jpayne@68 213
jpayne@68 214 // System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh);
jpayne@68 215 // System.out.println("\ntrusted=\n"+trusted);
jpayne@68 216 // System.out.println("\ncount=\n"+count);
jpayne@68 217
jpayne@68 218 if(count==null){
jpayne@68 219 final long cells=1L<<kbits;
jpayne@68 220 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
jpayne@68 221 count=KCountArray.makeNew(cells, cbits);
jpayne@68 222 }
jpayne@68 223
jpayne@68 224 final ConcurrentReadInputStream cris;
jpayne@68 225 {
jpayne@68 226 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
jpayne@68 227 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
jpayne@68 228 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
jpayne@68 229 cris.start(); //4567
jpayne@68 230 }
jpayne@68 231
jpayne@68 232 assert(cris!=null) : reads1;
jpayne@68 233 System.err.println("Started cris");
jpayne@68 234 boolean paired=cris.paired();
jpayne@68 235 if(verbose){System.err.println("Paired: "+paired);}
jpayne@68 236
jpayne@68 237 count(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
jpayne@68 238
jpayne@68 239 cris.close();
jpayne@68 240 if(verbose){System.err.println("Closed stream");}
jpayne@68 241
jpayne@68 242 // System.out.println("*** after ***");
jpayne@68 243 // System.out.println("\ntrusted=\n"+trusted);
jpayne@68 244 // System.out.println("\ncount=\n"+count);
jpayne@68 245
jpayne@68 246 return count;
jpayne@68 247 }
jpayne@68 248
jpayne@68 249
jpayne@68 250
jpayne@68 251
jpayne@68 252 public static void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
jpayne@68 253 final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
jpayne@68 254
jpayne@68 255 assert(k<32 && k>=1 && (count!=null || k<20));
jpayne@68 256 final int kbits=2*k;
jpayne@68 257 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
jpayne@68 258
jpayne@68 259 ListNum<Read> ln=cris.nextList();
jpayne@68 260 ArrayList<Read> reads=(ln!=null ? ln.list : null);
jpayne@68 261
jpayne@68 262 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 263 //System.err.println("reads.size()="+reads.size());
jpayne@68 264 for(Read r : reads){
jpayne@68 265
jpayne@68 266 Read r2=r.mate;
jpayne@68 267 {
jpayne@68 268 if(trusted!=null){
jpayne@68 269 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
jpayne@68 270 ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
jpayne@68 271 // System.out.println("\n"+toString(bs, r.length()));
jpayne@68 272 // System.out.println(new String(r.bases));
jpayne@68 273 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
jpayne@68 274 r.bases[i]='N';
jpayne@68 275 r.quality[i]=0;
jpayne@68 276 }
jpayne@68 277 // System.out.println(new String(r.bases));
jpayne@68 278 // System.out.println("used = "+String.format(Locale.ROOT, "%.3f%%",count.usedFraction()*100));
jpayne@68 279 // System.out.println("used = "+((KCountArray4)count).cellsUsed());
jpayne@68 280 // if(bs.length()<r.length()){r=null;}
jpayne@68 281 }
jpayne@68 282 // if(r!=null){addRead(r, count, k, mask, rcomp);}
jpayne@68 283 addRead(r, count, k, mask, rcomp);
jpayne@68 284 }
jpayne@68 285 if(r2!=null){
jpayne@68 286 if(trusted!=null){
jpayne@68 287 BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
jpayne@68 288 ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
jpayne@68 289 for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
jpayne@68 290 r2.bases[i]='N';
jpayne@68 291 r2.quality[i]=0;
jpayne@68 292 }
jpayne@68 293 }
jpayne@68 294 addRead(r2, count, k, mask, rcomp);
jpayne@68 295 }
jpayne@68 296
jpayne@68 297 }
jpayne@68 298 //System.err.println("returning list");
jpayne@68 299 cris.returnList(ln);
jpayne@68 300 //System.err.println("fetching list");
jpayne@68 301 ln=cris.nextList();
jpayne@68 302 reads=(ln!=null ? ln.list : null);
jpayne@68 303 }
jpayne@68 304
jpayne@68 305 if(verbose){System.err.println("Finished reading");}
jpayne@68 306 cris.returnList(ln);
jpayne@68 307 if(verbose){System.err.println("Returned list");}
jpayne@68 308 }
jpayne@68 309
jpayne@68 310
jpayne@68 311
jpayne@68 312 public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
jpayne@68 313 int len=0;
jpayne@68 314 long kmer=0;
jpayne@68 315 byte[] bases=r.bases;
jpayne@68 316 byte[] quals=r.quality;
jpayne@68 317 for(int i=0; i<bases.length; i++){
jpayne@68 318 byte b=bases[i];
jpayne@68 319 int x=AminoAcid.baseToNumber[b];
jpayne@68 320 if(x<0 || (quals!=null && quals[i]<minQuality)){
jpayne@68 321 len=0;
jpayne@68 322 kmer=0;
jpayne@68 323 }else{
jpayne@68 324 kmer=((kmer<<2)|x)&mask;
jpayne@68 325 len++;
jpayne@68 326 if(len>=k){
jpayne@68 327 keysCounted++;
jpayne@68 328 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
jpayne@68 329 count.increment(kmer);
jpayne@68 330 // System.out.println(" -> "+count.read(kmer));
jpayne@68 331 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
jpayne@68 332 // array[(int)kmer]++;
jpayne@68 333 // System.out.println(" -> "+array[(int)kmer]+"\n");
jpayne@68 334 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
jpayne@68 335 }
jpayne@68 336 }
jpayne@68 337 }
jpayne@68 338 if(rcomp){
jpayne@68 339 r.reverseComplement();
jpayne@68 340 addRead(r, count, k, mask, false);
jpayne@68 341 }
jpayne@68 342 }
jpayne@68 343
jpayne@68 344 public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
jpayne@68 345 int len=0;
jpayne@68 346 int shift=k2*2;
jpayne@68 347 long kmer1=0;
jpayne@68 348 long kmer2=0;
jpayne@68 349 byte[] bases=r.bases;
jpayne@68 350 byte[] quals=r.quality;
jpayne@68 351
jpayne@68 352 assert(kmer1>=kmer2);
jpayne@68 353
jpayne@68 354 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
jpayne@68 355
jpayne@68 356 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
jpayne@68 357 int x1=AminoAcid.baseToNumber[bases[i]];
jpayne@68 358 int x2=AminoAcid.baseToNumber[bases[j]];
jpayne@68 359 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
jpayne@68 360 len=0;
jpayne@68 361 kmer1=0;
jpayne@68 362 kmer2=0;
jpayne@68 363 }else{
jpayne@68 364 kmer1=((kmer1<<2)|x1)&mask1;
jpayne@68 365 kmer2=((kmer2<<2)|x2)&mask2;
jpayne@68 366 len++;
jpayne@68 367 if(len>=k1){
jpayne@68 368 keysCounted++;
jpayne@68 369 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
jpayne@68 370
jpayne@68 371 long key=(kmer1<<shift)|kmer2;
jpayne@68 372 // System.err.println(Long.toHexString(key));
jpayne@68 373 count.increment(key);
jpayne@68 374 // System.out.println(" -> "+count.read(kmer));
jpayne@68 375 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
jpayne@68 376 // array[(int)kmer]++;
jpayne@68 377 // System.out.println(" -> "+array[(int)kmer]+"\n");
jpayne@68 378 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
jpayne@68 379 }
jpayne@68 380 }
jpayne@68 381 }
jpayne@68 382 if(rcomp){
jpayne@68 383 r.reverseComplement();
jpayne@68 384 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
jpayne@68 385 }
jpayne@68 386 }
jpayne@68 387
jpayne@68 388 public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
jpayne@68 389 int len=0;
jpayne@68 390 int shift=k2*2;
jpayne@68 391 long kmer1=0;
jpayne@68 392 long kmer2=0;
jpayne@68 393 byte[] quals=null;
jpayne@68 394
jpayne@68 395 assert(kmer1>=kmer2);
jpayne@68 396
jpayne@68 397 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
jpayne@68 398
jpayne@68 399 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
jpayne@68 400 int x1=AminoAcid.baseToNumber[bases[i]];
jpayne@68 401 int x2=AminoAcid.baseToNumber[bases[j]];
jpayne@68 402 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
jpayne@68 403 len=0;
jpayne@68 404 kmer1=0;
jpayne@68 405 kmer2=0;
jpayne@68 406 }else{
jpayne@68 407 kmer1=((kmer1<<2)|x1)&mask1;
jpayne@68 408 kmer2=((kmer2<<2)|x2)&mask2;
jpayne@68 409 len++;
jpayne@68 410 if(len>=k1){
jpayne@68 411 keysCounted++;
jpayne@68 412 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
jpayne@68 413
jpayne@68 414 long key=(kmer1<<shift)|kmer2;
jpayne@68 415 System.out.println(Long.toHexString(kmer1));
jpayne@68 416 System.out.println(Long.toHexString(kmer2));
jpayne@68 417 System.out.println(Long.toHexString(key));
jpayne@68 418 count.increment(key);
jpayne@68 419 // System.out.println(" -> "+count.read(kmer));
jpayne@68 420 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
jpayne@68 421 // array[(int)kmer]++;
jpayne@68 422 // System.out.println(" -> "+array[(int)kmer]+"\n");
jpayne@68 423 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
jpayne@68 424 }
jpayne@68 425 }
jpayne@68 426 }
jpayne@68 427 if(rcomp){
jpayne@68 428 AminoAcid.reverseComplementBasesInPlace(bases);
jpayne@68 429 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
jpayne@68 430 }
jpayne@68 431 }
jpayne@68 432
jpayne@68 433 }