annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package bloom;
jpayne@68 2
jpayne@68 3 import java.util.ArrayList;
jpayne@68 4 import java.util.Locale;
jpayne@68 5
jpayne@68 6 import dna.AminoAcid;
jpayne@68 7 import fileIO.ReadWrite;
jpayne@68 8 import shared.Timer;
jpayne@68 9 import stream.ConcurrentGenericReadInputStream;
jpayne@68 10 import stream.FastqReadInputStream;
jpayne@68 11 import stream.Read;
jpayne@68 12 import structures.ListNum;
jpayne@68 13
jpayne@68 14 /**
jpayne@68 15 * @author Brian Bushnell
jpayne@68 16 * @date Jul 6, 2012
jpayne@68 17 *
jpayne@68 18 */
jpayne@68 19 public class LargeKmerCount {
jpayne@68 20
jpayne@68 21 public static void main(String[] args){
jpayne@68 22
jpayne@68 23 Timer t=new Timer();
jpayne@68 24
jpayne@68 25 String fname1=args[0];
jpayne@68 26 String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null);
jpayne@68 27 int indexbits=Integer.parseInt(args[args.length-3]);
jpayne@68 28 int cbits=Integer.parseInt(args[args.length-2]);
jpayne@68 29 int k=Integer.parseInt(args[args.length-1]);
jpayne@68 30
jpayne@68 31 KCountArray2 count=countFastq(fname1, fname2, indexbits, cbits, k);
jpayne@68 32 t.stop();
jpayne@68 33 System.out.println("Finished counting; time = "+t);
jpayne@68 34
jpayne@68 35 long[] freq=count.transformToFrequency();
jpayne@68 36
jpayne@68 37 // System.out.println(count+"\n");
jpayne@68 38 // System.out.println(Arrays.toString(freq)+"\n");
jpayne@68 39
jpayne@68 40 long sum=sum(freq);
jpayne@68 41 System.out.println("Kmer fraction:");
jpayne@68 42 int lim1=8, lim2=16;
jpayne@68 43 for(int i=0; i<lim1; i++){
jpayne@68 44 String prefix=i+"";
jpayne@68 45 while(prefix.length()<8){prefix=prefix+" ";}
jpayne@68 46 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
jpayne@68 47 }
jpayne@68 48 while(lim1<=freq.length){
jpayne@68 49 int x=0;
jpayne@68 50 for(int i=lim1; i<lim2; i++){
jpayne@68 51 x+=freq[i];
jpayne@68 52 }
jpayne@68 53 String prefix=lim1+"-"+(lim2-1);
jpayne@68 54 if(lim2>=freq.length){prefix=lim1+"+";}
jpayne@68 55 while(prefix.length()<8){prefix=prefix+" ";}
jpayne@68 56 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
jpayne@68 57 lim1*=2;
jpayne@68 58 lim2=min(lim2*2, freq.length);
jpayne@68 59 }
jpayne@68 60
jpayne@68 61 long sum2=sum-freq[0];
jpayne@68 62 long x=freq[1];
jpayne@68 63 System.out.println();
jpayne@68 64 System.out.println("Unique: \t \t"+sum2);
jpayne@68 65 System.out.println("CollisionsA:\t \t"+collisionsA);
jpayne@68 66 System.out.println("CollisionsB:\t \t"+collisionsB);
jpayne@68 67
jpayne@68 68 double modifier=(collisionsB)/(double)(32*collisionsA+8*collisionsB);
jpayne@68 69
jpayne@68 70 System.out.println("Estimate: \t \t"+(sum2+collisionsA+collisionsB-(long)(collisionsA*modifier)));
jpayne@68 71 System.out.println();
jpayne@68 72 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
jpayne@68 73 x=sum2-x;
jpayne@68 74 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
jpayne@68 75
jpayne@68 76 }
jpayne@68 77
jpayne@68 78 public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){
jpayne@68 79 assert(indexbits>=1 && indexbits<40);
jpayne@68 80 collisionsA=0;
jpayne@68 81 collisionsB=0;
jpayne@68 82 final long cells=1L<<indexbits;
jpayne@68 83 final int kbits=ROTATE_DIST*k;
jpayne@68 84 final int xorShift=kbits%64;
jpayne@68 85 final long[] rotMasks=makeRotMasks(xorShift);
jpayne@68 86 final int[] buffer=new int[k];
jpayne@68 87 if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);}
jpayne@68 88 if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));}
jpayne@68 89 final KCountArray2 count=new KCountArray2(cells, cbits);
jpayne@68 90
jpayne@68 91 FastqReadInputStream fris1=new FastqReadInputStream(reads1, false);
jpayne@68 92 FastqReadInputStream fris2=(reads2==null ? null : new FastqReadInputStream(reads2, false));
jpayne@68 93 ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, maxReads);
jpayne@68 94
jpayne@68 95 cris.start();
jpayne@68 96 System.err.println("Started cris");
jpayne@68 97 boolean paired=cris.paired();
jpayne@68 98 System.err.println("Paired: "+paired);
jpayne@68 99
jpayne@68 100 long kmer=0; //current kmer
jpayne@68 101 int len=0; //distance since last contig start or ambiguous base
jpayne@68 102
jpayne@68 103 {
jpayne@68 104 ListNum<Read> ln=cris.nextList();
jpayne@68 105 ArrayList<Read> reads=(ln!=null ? ln.list : null);
jpayne@68 106
jpayne@68 107 if(reads!=null && !reads.isEmpty()){
jpayne@68 108 Read r=reads.get(0);
jpayne@68 109 assert(paired==(r.mate!=null));
jpayne@68 110 }
jpayne@68 111
jpayne@68 112 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
jpayne@68 113 //System.err.println("reads.size()="+reads.size());
jpayne@68 114 for(Read r : reads){
jpayne@68 115 readsProcessed++;
jpayne@68 116
jpayne@68 117 len=0;
jpayne@68 118 kmer=0;
jpayne@68 119 byte[] bases=r.bases;
jpayne@68 120 byte[] quals=r.quality;
jpayne@68 121
jpayne@68 122 for(int i=0; i<bases.length; i++){
jpayne@68 123 byte b=bases[i];
jpayne@68 124 int x=AminoAcid.baseToNumber[b];
jpayne@68 125 int x2=buffer[len%buffer.length];
jpayne@68 126 buffer[len%buffer.length]=x;
jpayne@68 127 if(x<0){
jpayne@68 128 len=0;
jpayne@68 129 kmer=0;
jpayne@68 130 }else{
jpayne@68 131 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
jpayne@68 132 len++;
jpayne@68 133 if(len>=k){
jpayne@68 134 if(len>k){kmer=kmer^rotMasks[x2];}
jpayne@68 135 long hashcode=kmer&0x7fffffffffffffffL;
jpayne@68 136 long code1=hashcode%(cells-3);
jpayne@68 137 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
jpayne@68 138 int value=count.increment2(code1, 1);
jpayne@68 139 long temp=count.read(code2);
jpayne@68 140 if(temp>0){
jpayne@68 141 if(value==0){collisionsA++;}
jpayne@68 142 else{collisionsB++;}
jpayne@68 143 }
jpayne@68 144 }
jpayne@68 145 }
jpayne@68 146 }
jpayne@68 147
jpayne@68 148
jpayne@68 149 if(r.mate!=null){
jpayne@68 150 len=0;
jpayne@68 151 kmer=0;
jpayne@68 152 bases=r.mate.bases;
jpayne@68 153 quals=r.mate.quality;
jpayne@68 154 for(int i=0; i<bases.length; i++){
jpayne@68 155 byte b=bases[i];
jpayne@68 156 int x=AminoAcid.baseToNumber[b];
jpayne@68 157 int x2=buffer[len%buffer.length];
jpayne@68 158 buffer[len%buffer.length]=x;
jpayne@68 159 if(x<0){
jpayne@68 160 len=0;
jpayne@68 161 kmer=0;
jpayne@68 162 }else{
jpayne@68 163 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
jpayne@68 164 len++;
jpayne@68 165 if(len>=k){
jpayne@68 166 if(len>k){kmer=kmer^rotMasks[x2];}
jpayne@68 167 long hashcode=kmer&0x7fffffffffffffffL;
jpayne@68 168 long code1=hashcode%(cells-3);
jpayne@68 169 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
jpayne@68 170 int value=count.increment2(code1, 1);
jpayne@68 171 long temp=count.read(code2);
jpayne@68 172 if(temp>0){
jpayne@68 173 if(value==0){collisionsA++;}
jpayne@68 174 else{collisionsB++;}
jpayne@68 175 }
jpayne@68 176 }
jpayne@68 177 }
jpayne@68 178 }
jpayne@68 179 }
jpayne@68 180
jpayne@68 181 }
jpayne@68 182 //System.err.println("returning list");
jpayne@68 183 cris.returnList(ln);
jpayne@68 184 //System.err.println("fetching list");
jpayne@68 185 ln=cris.nextList();
jpayne@68 186 reads=(ln!=null ? ln.list : null);
jpayne@68 187 }
jpayne@68 188 System.err.println("Finished reading");
jpayne@68 189 cris.returnList(ln);
jpayne@68 190 System.err.println("Returned list");
jpayne@68 191 ReadWrite.closeStream(cris);
jpayne@68 192 System.err.println("Closed stream");
jpayne@68 193 System.err.println("Processed "+readsProcessed+" reads.");
jpayne@68 194 }
jpayne@68 195
jpayne@68 196 return count;
jpayne@68 197 }
jpayne@68 198
jpayne@68 199 public static final long[] makeRotMasks(int rotDist){
jpayne@68 200 long[] masks=new long[4];
jpayne@68 201 for(long i=0; i<4; i++){
jpayne@68 202 masks[(int)i]=Long.rotateLeft(i, rotDist);
jpayne@68 203 }
jpayne@68 204 return masks;
jpayne@68 205 }
jpayne@68 206
jpayne@68 207 public static long[] transformToFrequency(int[] count){
jpayne@68 208 long[] freq=new long[2000];
jpayne@68 209 int max=freq.length-1;
jpayne@68 210 for(int i=0; i<count.length; i++){
jpayne@68 211 int x=count[i];
jpayne@68 212 x=min(x, max);
jpayne@68 213 freq[x]++;
jpayne@68 214 }
jpayne@68 215 return freq;
jpayne@68 216 }
jpayne@68 217
jpayne@68 218 public static long sum(int[] array){
jpayne@68 219 long x=0;
jpayne@68 220 for(int y : array){x+=y;}
jpayne@68 221 return x;
jpayne@68 222 }
jpayne@68 223
jpayne@68 224 public static long sum(long[] array){
jpayne@68 225 long x=0;
jpayne@68 226 for(long y : array){x+=y;}
jpayne@68 227 return x;
jpayne@68 228 }
jpayne@68 229
jpayne@68 230 public static final int min(int x, int y){return x<y ? x : y;}
jpayne@68 231 public static final int max(int x, int y){return x>y ? x : y;}
jpayne@68 232
jpayne@68 233 public static boolean verbose=true;
jpayne@68 234 public static byte minQuality=-5;
jpayne@68 235 public static long readsProcessed=0;
jpayne@68 236 public static long maxReads=1000000L;
jpayne@68 237 public static final int ROTATE_DIST=2;
jpayne@68 238
jpayne@68 239 public static long collisionsA=0;
jpayne@68 240 public static long collisionsB=0;
jpayne@68 241
jpayne@68 242 }