Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount2.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package bloom; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.Locale; | |
5 import java.util.Random; | |
6 | |
7 import dna.AminoAcid; | |
8 import fileIO.FileFormat; | |
9 import fileIO.ReadWrite; | |
10 import shared.Timer; | |
11 import stream.ConcurrentReadInputStream; | |
12 import stream.FastaReadInputStream; | |
13 import stream.Read; | |
14 import structures.ListNum; | |
15 | |
16 /** | |
17 * @author Brian Bushnell | |
18 * @date Jul 6, 2012 | |
19 * | |
20 */ | |
21 public class LargeKmerCount2 { | |
22 | |
23 public static void main(String[] args){ | |
24 | |
25 Timer t=new Timer(); | |
26 | |
27 String fname1=args[0]; | |
28 String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); | |
29 int indexbits=Integer.parseInt(args[args.length-3]); | |
30 int cbits=Integer.parseInt(args[args.length-2]); | |
31 int k=Integer.parseInt(args[args.length-1]); | |
32 | |
33 KCountArray2 count=null; | |
34 | |
35 if(fileIO.FileFormat.hasFastaExtension(fname1)){ | |
36 FastaReadInputStream.MIN_READ_LEN=k; | |
37 } | |
38 count=countFastq(fname1, fname2, indexbits, cbits, k); | |
39 | |
40 t.stop(); | |
41 System.out.println("Finished counting; time = "+t); | |
42 | |
43 long[] freq=count.transformToFrequency(); | |
44 | |
45 // System.out.println(count+"\n"); | |
46 // System.out.println(Arrays.toString(freq)+"\n"); | |
47 | |
48 long sum=sum(freq); | |
49 System.out.println("Kmer fraction:"); | |
50 int lim1=8, lim2=16; | |
51 for(int i=0; i<lim1; i++){ | |
52 String prefix=i+""; | |
53 while(prefix.length()<8){prefix=prefix+" ";} | |
54 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); | |
55 } | |
56 while(lim1<=freq.length){ | |
57 int x=0; | |
58 for(int i=lim1; i<lim2; i++){ | |
59 x+=freq[i]; | |
60 } | |
61 String prefix=lim1+"-"+(lim2-1); | |
62 if(lim2>=freq.length){prefix=lim1+"+";} | |
63 while(prefix.length()<8){prefix=prefix+" ";} | |
64 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); | |
65 lim1*=2; | |
66 lim2=min(lim2*2, freq.length); | |
67 } | |
68 | |
69 long estKmers=load+min(actualCollisions, (long)expectedCollisions); | |
70 | |
71 long sum2=sum-freq[0]; | |
72 long x=freq[1]; | |
73 System.out.println(); | |
74 System.out.println("Keys Counted: \t \t"+keysCounted); | |
75 System.out.println("Unique: \t \t"+sum2); | |
76 System.out.println("probCollisions:\t \t"+(long)probNewKeyCollisions); | |
77 System.out.println("EstimateP: \t \t"+(sum2+(long)probNewKeyCollisions)); | |
78 System.out.println("expectedColl: \t \t"+(long)expectedCollisions); | |
79 System.out.println("actualColl: \t \t"+(long)actualCollisions); | |
80 System.out.println("estimateKmers: \t \t"+estKmers); | |
81 System.out.println(); | |
82 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
83 x=sum2-x; | |
84 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
85 | |
86 } | |
87 | |
88 public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){ | |
89 assert(indexbits>=1 && indexbits<40); | |
90 final long cells=1L<<indexbits; | |
91 final int kbits=ROTATE_DIST*k; | |
92 final int xorShift=kbits%64; | |
93 final long[] rotMasks=makeRotMasks(xorShift); | |
94 final int[] buffer=new int[k]; | |
95 if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);} | |
96 if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));} | |
97 final KCountArray2 count=new KCountArray2(cells, cbits); | |
98 load=0; | |
99 probNewKeyCollisions=0; | |
100 invCells=1d/cells; | |
101 invKmerSpace=Math.pow(0.5, 2*k); | |
102 if(cells>=Math.pow(4, k)){invCells=0;} | |
103 | |
104 final ConcurrentReadInputStream cris; | |
105 { | |
106 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
107 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
108 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
109 if(verbose){System.err.println("Started cris");} | |
110 cris.start(); //4567 | |
111 } | |
112 boolean paired=cris.paired(); | |
113 if(verbose){System.err.println("Paired: "+paired);} | |
114 | |
115 long kmer=0; //current kmer | |
116 int len=0; //distance since last contig start or ambiguous base | |
117 | |
118 { | |
119 ListNum<Read> ln=cris.nextList(); | |
120 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
121 | |
122 if(reads!=null && !reads.isEmpty()){ | |
123 Read r=reads.get(0); | |
124 assert(paired==(r.mate!=null)); | |
125 } | |
126 | |
127 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
128 //System.err.println("reads.size()="+reads.size()); | |
129 for(Read r : reads){ | |
130 readsProcessed++; | |
131 | |
132 len=0; | |
133 kmer=0; | |
134 byte[] bases=r.bases; | |
135 byte[] quals=r.quality; | |
136 | |
137 for(int i=0; i<bases.length; i++){ | |
138 byte b=bases[i]; | |
139 int x=AminoAcid.baseToNumber[b]; | |
140 int x2=buffer[len%buffer.length]; | |
141 buffer[len%buffer.length]=x; | |
142 if(x<0){ | |
143 len=0; | |
144 kmer=0; | |
145 }else{ | |
146 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); | |
147 len++; | |
148 if(len>=k){ | |
149 keysCounted++; | |
150 if(len>k){kmer=kmer^rotMasks[x2];} | |
151 long hashcode=kmer&0x7fffffffffffffffL; | |
152 // hashcode=randy.nextLong()&~((-1L)<<(2*k)); | |
153 long code1=hashcode%(cells-3); | |
154 // long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); | |
155 int value=count.increment2(code1, 1); | |
156 | |
157 double probCollision=load*invCells; | |
158 // expectedCollisions+=probCollision; | |
159 expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); | |
160 if(value==0){load++;} | |
161 else{ | |
162 actualCollisions++; | |
163 double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); | |
164 double estKeys=load+probNewKeyCollisions; | |
165 double probOldKey=estKeys*invKmerSpace; | |
166 probNewKeyCollisions+=probNewKey*(1-probOldKey); | |
167 | |
168 // double estKmers=load+min(actualCollisions, expectedCollisions); | |
169 // double probOldKmer=estKmers*invKmerSpace; | |
170 // probNewKeyCollisions+=(prob*(1-prob2)); | |
171 } | |
172 | |
173 //// probCollisions+=(load*invCells); | |
174 // if(value==0){load++;} | |
175 // else{ | |
176 //// long load2=keysCounted-load; | |
177 // double prob=Math.sqrt(load*invCells); | |
178 // double estKmers=load+probNewKeyCollisions; | |
179 // double prob2=estKmers*invKmerSpace; | |
180 //// probCollisions+=(prob*(1-prob2)); | |
181 //// probCollisions+=Math.sqrt(prob*(1-prob2)); | |
182 // probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); | |
183 //// probCollisions+=min(prob, 1-prob2); | |
184 //// probCollisions+=(load*invCells); | |
185 // } | |
186 } | |
187 } | |
188 } | |
189 | |
190 | |
191 if(r.mate!=null){ | |
192 len=0; | |
193 kmer=0; | |
194 bases=r.mate.bases; | |
195 quals=r.mate.quality; | |
196 for(int i=0; i<bases.length; i++){ | |
197 byte b=bases[i]; | |
198 int x=AminoAcid.baseToNumber[b]; | |
199 int x2=buffer[len%buffer.length]; | |
200 buffer[len%buffer.length]=x; | |
201 if(x<0){ | |
202 len=0; | |
203 kmer=0; | |
204 }else{ | |
205 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); | |
206 len++; | |
207 if(len>=k){ | |
208 keysCounted++; | |
209 if(len>k){kmer=kmer^rotMasks[x2];} | |
210 long hashcode=kmer&0x7fffffffffffffffL; | |
211 // hashcode=randy.nextLong()&~((-1L)<<(2*k)); | |
212 long code1=hashcode%(cells-3); | |
213 // long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); | |
214 int value=count.increment2(code1, 1); | |
215 | |
216 double probCollision=load*invCells; | |
217 // expectedCollisions+=probCollision; | |
218 expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); | |
219 if(value==0){load++;} | |
220 else{ | |
221 actualCollisions++; | |
222 double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); | |
223 double estKeys=load+probNewKeyCollisions; | |
224 double probOldKey=estKeys*invKmerSpace; | |
225 probNewKeyCollisions+=probNewKey*(1-probOldKey); | |
226 | |
227 // double estKmers=load+min(actualCollisions, expectedCollisions); | |
228 // double probOldKmer=estKmers*invKmerSpace; | |
229 // probNewKeyCollisions+=(prob*(1-prob2)); | |
230 } | |
231 | |
232 //// probCollisions+=(load*invCells); | |
233 // if(value==0){load++;} | |
234 // else{ | |
235 //// long load2=keysCounted-load; | |
236 // double prob=Math.sqrt(load*invCells); | |
237 // double estKmers=load+probNewKeyCollisions; | |
238 // double prob2=estKmers*invKmerSpace; | |
239 //// probCollisions+=(prob*(1-prob2)); | |
240 //// probCollisions+=Math.sqrt(prob*(1-prob2)); | |
241 // probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); | |
242 //// probCollisions+=min(prob, 1-prob2); | |
243 //// probCollisions+=(load*invCells); | |
244 // } | |
245 } | |
246 } | |
247 } | |
248 } | |
249 | |
250 } | |
251 //System.err.println("returning list"); | |
252 cris.returnList(ln); | |
253 //System.err.println("fetching list"); | |
254 ln=cris.nextList(); | |
255 reads=(ln!=null ? ln.list : null); | |
256 } | |
257 System.err.println("Finished reading"); | |
258 cris.returnList(ln); | |
259 System.err.println("Returned list"); | |
260 ReadWrite.closeStream(cris); | |
261 System.err.println("Closed stream"); | |
262 System.err.println("Processed "+readsProcessed+" reads."); | |
263 } | |
264 | |
265 return count; | |
266 } | |
267 | |
268 public static final long[] makeRotMasks(int rotDist){ | |
269 long[] masks=new long[4]; | |
270 for(long i=0; i<4; i++){ | |
271 masks[(int)i]=Long.rotateLeft(i, rotDist); | |
272 } | |
273 return masks; | |
274 } | |
275 | |
276 public static long[] transformToFrequency(int[] count){ | |
277 long[] freq=new long[2000]; | |
278 int max=freq.length-1; | |
279 for(int i=0; i<count.length; i++){ | |
280 int x=count[i]; | |
281 x=min(x, max); | |
282 freq[x]++; | |
283 } | |
284 return freq; | |
285 } | |
286 | |
287 public static long sum(int[] array){ | |
288 long x=0; | |
289 for(int y : array){x+=y;} | |
290 return x; | |
291 } | |
292 | |
293 public static long sum(long[] array){ | |
294 long x=0; | |
295 for(long y : array){x+=y;} | |
296 return x; | |
297 } | |
298 | |
299 public static final int min(int x, int y){return x<y ? x : y;} | |
300 public static final int max(int x, int y){return x>y ? x : y;} | |
301 public static final long min(long x, long y){return x<y ? x : y;} | |
302 public static final long max(long x, long y){return x>y ? x : y;} | |
303 public static final double min(double x, double y){return x<y ? x : y;} | |
304 public static final double max(double x, double y){return x>y ? x : y;} | |
305 | |
306 public static boolean verbose=true; | |
307 public static byte minQuality=-5; | |
308 public static long readsProcessed=0; | |
309 public static long maxReads=10000000L; | |
310 public static final int ROTATE_DIST=2; | |
311 | |
312 /** Non-empty cells in hash table */ | |
313 public static long load; | |
314 /** Number of expected collisions */ | |
315 public static double expectedCollisions; | |
316 /** Number of actual collisions (possibly by same value) */ | |
317 public static long actualCollisions; | |
318 /** Number of probable collisions caused by new keys */ | |
319 public static double probNewKeyCollisions; | |
320 /** Inverse of hash table size */ | |
321 public static double invCells; | |
322 /** Inverse of number of potential kmers */ | |
323 public static double invKmerSpace; | |
324 /** Inverse of number of potential kmers */ | |
325 public static long keysCounted; | |
326 | |
327 public static final Random randy=new Random(1); | |
328 | |
329 } |