comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount2.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package bloom;
2
3 import java.util.ArrayList;
4 import java.util.Locale;
5 import java.util.Random;
6
7 import dna.AminoAcid;
8 import fileIO.FileFormat;
9 import fileIO.ReadWrite;
10 import shared.Timer;
11 import stream.ConcurrentReadInputStream;
12 import stream.FastaReadInputStream;
13 import stream.Read;
14 import structures.ListNum;
15
16 /**
17 * @author Brian Bushnell
18 * @date Jul 6, 2012
19 *
20 */
21 public class LargeKmerCount2 {
22
23 public static void main(String[] args){
24
25 Timer t=new Timer();
26
27 String fname1=args[0];
28 String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null);
29 int indexbits=Integer.parseInt(args[args.length-3]);
30 int cbits=Integer.parseInt(args[args.length-2]);
31 int k=Integer.parseInt(args[args.length-1]);
32
33 KCountArray2 count=null;
34
35 if(fileIO.FileFormat.hasFastaExtension(fname1)){
36 FastaReadInputStream.MIN_READ_LEN=k;
37 }
38 count=countFastq(fname1, fname2, indexbits, cbits, k);
39
40 t.stop();
41 System.out.println("Finished counting; time = "+t);
42
43 long[] freq=count.transformToFrequency();
44
45 // System.out.println(count+"\n");
46 // System.out.println(Arrays.toString(freq)+"\n");
47
48 long sum=sum(freq);
49 System.out.println("Kmer fraction:");
50 int lim1=8, lim2=16;
51 for(int i=0; i<lim1; i++){
52 String prefix=i+"";
53 while(prefix.length()<8){prefix=prefix+" ";}
54 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
55 }
56 while(lim1<=freq.length){
57 int x=0;
58 for(int i=lim1; i<lim2; i++){
59 x+=freq[i];
60 }
61 String prefix=lim1+"-"+(lim2-1);
62 if(lim2>=freq.length){prefix=lim1+"+";}
63 while(prefix.length()<8){prefix=prefix+" ";}
64 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
65 lim1*=2;
66 lim2=min(lim2*2, freq.length);
67 }
68
69 long estKmers=load+min(actualCollisions, (long)expectedCollisions);
70
71 long sum2=sum-freq[0];
72 long x=freq[1];
73 System.out.println();
74 System.out.println("Keys Counted: \t \t"+keysCounted);
75 System.out.println("Unique: \t \t"+sum2);
76 System.out.println("probCollisions:\t \t"+(long)probNewKeyCollisions);
77 System.out.println("EstimateP: \t \t"+(sum2+(long)probNewKeyCollisions));
78 System.out.println("expectedColl: \t \t"+(long)expectedCollisions);
79 System.out.println("actualColl: \t \t"+(long)actualCollisions);
80 System.out.println("estimateKmers: \t \t"+estKmers);
81 System.out.println();
82 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
83 x=sum2-x;
84 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
85
86 }
87
88 public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){
89 assert(indexbits>=1 && indexbits<40);
90 final long cells=1L<<indexbits;
91 final int kbits=ROTATE_DIST*k;
92 final int xorShift=kbits%64;
93 final long[] rotMasks=makeRotMasks(xorShift);
94 final int[] buffer=new int[k];
95 if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);}
96 if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));}
97 final KCountArray2 count=new KCountArray2(cells, cbits);
98 load=0;
99 probNewKeyCollisions=0;
100 invCells=1d/cells;
101 invKmerSpace=Math.pow(0.5, 2*k);
102 if(cells>=Math.pow(4, k)){invCells=0;}
103
104 final ConcurrentReadInputStream cris;
105 {
106 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
107 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
108 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
109 if(verbose){System.err.println("Started cris");}
110 cris.start(); //4567
111 }
112 boolean paired=cris.paired();
113 if(verbose){System.err.println("Paired: "+paired);}
114
115 long kmer=0; //current kmer
116 int len=0; //distance since last contig start or ambiguous base
117
118 {
119 ListNum<Read> ln=cris.nextList();
120 ArrayList<Read> reads=(ln!=null ? ln.list : null);
121
122 if(reads!=null && !reads.isEmpty()){
123 Read r=reads.get(0);
124 assert(paired==(r.mate!=null));
125 }
126
127 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
128 //System.err.println("reads.size()="+reads.size());
129 for(Read r : reads){
130 readsProcessed++;
131
132 len=0;
133 kmer=0;
134 byte[] bases=r.bases;
135 byte[] quals=r.quality;
136
137 for(int i=0; i<bases.length; i++){
138 byte b=bases[i];
139 int x=AminoAcid.baseToNumber[b];
140 int x2=buffer[len%buffer.length];
141 buffer[len%buffer.length]=x;
142 if(x<0){
143 len=0;
144 kmer=0;
145 }else{
146 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
147 len++;
148 if(len>=k){
149 keysCounted++;
150 if(len>k){kmer=kmer^rotMasks[x2];}
151 long hashcode=kmer&0x7fffffffffffffffL;
152 // hashcode=randy.nextLong()&~((-1L)<<(2*k));
153 long code1=hashcode%(cells-3);
154 // long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
155 int value=count.increment2(code1, 1);
156
157 double probCollision=load*invCells;
158 // expectedCollisions+=probCollision;
159 expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace);
160 if(value==0){load++;}
161 else{
162 actualCollisions++;
163 double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions));
164 double estKeys=load+probNewKeyCollisions;
165 double probOldKey=estKeys*invKmerSpace;
166 probNewKeyCollisions+=probNewKey*(1-probOldKey);
167
168 // double estKmers=load+min(actualCollisions, expectedCollisions);
169 // double probOldKmer=estKmers*invKmerSpace;
170 // probNewKeyCollisions+=(prob*(1-prob2));
171 }
172
173 //// probCollisions+=(load*invCells);
174 // if(value==0){load++;}
175 // else{
176 //// long load2=keysCounted-load;
177 // double prob=Math.sqrt(load*invCells);
178 // double estKmers=load+probNewKeyCollisions;
179 // double prob2=estKmers*invKmerSpace;
180 //// probCollisions+=(prob*(1-prob2));
181 //// probCollisions+=Math.sqrt(prob*(1-prob2));
182 // probNewKeyCollisions+=Math.sqrt(prob*(1-prob2));
183 //// probCollisions+=min(prob, 1-prob2);
184 //// probCollisions+=(load*invCells);
185 // }
186 }
187 }
188 }
189
190
191 if(r.mate!=null){
192 len=0;
193 kmer=0;
194 bases=r.mate.bases;
195 quals=r.mate.quality;
196 for(int i=0; i<bases.length; i++){
197 byte b=bases[i];
198 int x=AminoAcid.baseToNumber[b];
199 int x2=buffer[len%buffer.length];
200 buffer[len%buffer.length]=x;
201 if(x<0){
202 len=0;
203 kmer=0;
204 }else{
205 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
206 len++;
207 if(len>=k){
208 keysCounted++;
209 if(len>k){kmer=kmer^rotMasks[x2];}
210 long hashcode=kmer&0x7fffffffffffffffL;
211 // hashcode=randy.nextLong()&~((-1L)<<(2*k));
212 long code1=hashcode%(cells-3);
213 // long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
214 int value=count.increment2(code1, 1);
215
216 double probCollision=load*invCells;
217 // expectedCollisions+=probCollision;
218 expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace);
219 if(value==0){load++;}
220 else{
221 actualCollisions++;
222 double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions));
223 double estKeys=load+probNewKeyCollisions;
224 double probOldKey=estKeys*invKmerSpace;
225 probNewKeyCollisions+=probNewKey*(1-probOldKey);
226
227 // double estKmers=load+min(actualCollisions, expectedCollisions);
228 // double probOldKmer=estKmers*invKmerSpace;
229 // probNewKeyCollisions+=(prob*(1-prob2));
230 }
231
232 //// probCollisions+=(load*invCells);
233 // if(value==0){load++;}
234 // else{
235 //// long load2=keysCounted-load;
236 // double prob=Math.sqrt(load*invCells);
237 // double estKmers=load+probNewKeyCollisions;
238 // double prob2=estKmers*invKmerSpace;
239 //// probCollisions+=(prob*(1-prob2));
240 //// probCollisions+=Math.sqrt(prob*(1-prob2));
241 // probNewKeyCollisions+=Math.sqrt(prob*(1-prob2));
242 //// probCollisions+=min(prob, 1-prob2);
243 //// probCollisions+=(load*invCells);
244 // }
245 }
246 }
247 }
248 }
249
250 }
251 //System.err.println("returning list");
252 cris.returnList(ln);
253 //System.err.println("fetching list");
254 ln=cris.nextList();
255 reads=(ln!=null ? ln.list : null);
256 }
257 System.err.println("Finished reading");
258 cris.returnList(ln);
259 System.err.println("Returned list");
260 ReadWrite.closeStream(cris);
261 System.err.println("Closed stream");
262 System.err.println("Processed "+readsProcessed+" reads.");
263 }
264
265 return count;
266 }
267
268 public static final long[] makeRotMasks(int rotDist){
269 long[] masks=new long[4];
270 for(long i=0; i<4; i++){
271 masks[(int)i]=Long.rotateLeft(i, rotDist);
272 }
273 return masks;
274 }
275
276 public static long[] transformToFrequency(int[] count){
277 long[] freq=new long[2000];
278 int max=freq.length-1;
279 for(int i=0; i<count.length; i++){
280 int x=count[i];
281 x=min(x, max);
282 freq[x]++;
283 }
284 return freq;
285 }
286
287 public static long sum(int[] array){
288 long x=0;
289 for(int y : array){x+=y;}
290 return x;
291 }
292
293 public static long sum(long[] array){
294 long x=0;
295 for(long y : array){x+=y;}
296 return x;
297 }
298
299 public static final int min(int x, int y){return x<y ? x : y;}
300 public static final int max(int x, int y){return x>y ? x : y;}
301 public static final long min(long x, long y){return x<y ? x : y;}
302 public static final long max(long x, long y){return x>y ? x : y;}
303 public static final double min(double x, double y){return x<y ? x : y;}
304 public static final double max(double x, double y){return x>y ? x : y;}
305
306 public static boolean verbose=true;
307 public static byte minQuality=-5;
308 public static long readsProcessed=0;
309 public static long maxReads=10000000L;
310 public static final int ROTATE_DIST=2;
311
312 /** Non-empty cells in hash table */
313 public static long load;
314 /** Number of expected collisions */
315 public static double expectedCollisions;
316 /** Number of actual collisions (possibly by same value) */
317 public static long actualCollisions;
318 /** Number of probable collisions caused by new keys */
319 public static double probNewKeyCollisions;
320 /** Inverse of hash table size */
321 public static double invCells;
322 /** Inverse of number of potential kmers */
323 public static double invKmerSpace;
324 /** Inverse of number of potential kmers */
325 public static long keysCounted;
326
327 public static final Random randy=new Random(1);
328
329 }