Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/LargeKmerCount.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package bloom; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.Locale; | |
5 | |
6 import dna.AminoAcid; | |
7 import fileIO.ReadWrite; | |
8 import shared.Timer; | |
9 import stream.ConcurrentGenericReadInputStream; | |
10 import stream.FastqReadInputStream; | |
11 import stream.Read; | |
12 import structures.ListNum; | |
13 | |
14 /** | |
15 * @author Brian Bushnell | |
16 * @date Jul 6, 2012 | |
17 * | |
18 */ | |
19 public class LargeKmerCount { | |
20 | |
21 public static void main(String[] args){ | |
22 | |
23 Timer t=new Timer(); | |
24 | |
25 String fname1=args[0]; | |
26 String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); | |
27 int indexbits=Integer.parseInt(args[args.length-3]); | |
28 int cbits=Integer.parseInt(args[args.length-2]); | |
29 int k=Integer.parseInt(args[args.length-1]); | |
30 | |
31 KCountArray2 count=countFastq(fname1, fname2, indexbits, cbits, k); | |
32 t.stop(); | |
33 System.out.println("Finished counting; time = "+t); | |
34 | |
35 long[] freq=count.transformToFrequency(); | |
36 | |
37 // System.out.println(count+"\n"); | |
38 // System.out.println(Arrays.toString(freq)+"\n"); | |
39 | |
40 long sum=sum(freq); | |
41 System.out.println("Kmer fraction:"); | |
42 int lim1=8, lim2=16; | |
43 for(int i=0; i<lim1; i++){ | |
44 String prefix=i+""; | |
45 while(prefix.length()<8){prefix=prefix+" ";} | |
46 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); | |
47 } | |
48 while(lim1<=freq.length){ | |
49 int x=0; | |
50 for(int i=lim1; i<lim2; i++){ | |
51 x+=freq[i]; | |
52 } | |
53 String prefix=lim1+"-"+(lim2-1); | |
54 if(lim2>=freq.length){prefix=lim1+"+";} | |
55 while(prefix.length()<8){prefix=prefix+" ";} | |
56 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); | |
57 lim1*=2; | |
58 lim2=min(lim2*2, freq.length); | |
59 } | |
60 | |
61 long sum2=sum-freq[0]; | |
62 long x=freq[1]; | |
63 System.out.println(); | |
64 System.out.println("Unique: \t \t"+sum2); | |
65 System.out.println("CollisionsA:\t \t"+collisionsA); | |
66 System.out.println("CollisionsB:\t \t"+collisionsB); | |
67 | |
68 double modifier=(collisionsB)/(double)(32*collisionsA+8*collisionsB); | |
69 | |
70 System.out.println("Estimate: \t \t"+(sum2+collisionsA+collisionsB-(long)(collisionsA*modifier))); | |
71 System.out.println(); | |
72 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
73 x=sum2-x; | |
74 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
75 | |
76 } | |
77 | |
78 public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){ | |
79 assert(indexbits>=1 && indexbits<40); | |
80 collisionsA=0; | |
81 collisionsB=0; | |
82 final long cells=1L<<indexbits; | |
83 final int kbits=ROTATE_DIST*k; | |
84 final int xorShift=kbits%64; | |
85 final long[] rotMasks=makeRotMasks(xorShift); | |
86 final int[] buffer=new int[k]; | |
87 if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);} | |
88 if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));} | |
89 final KCountArray2 count=new KCountArray2(cells, cbits); | |
90 | |
91 FastqReadInputStream fris1=new FastqReadInputStream(reads1, false); | |
92 FastqReadInputStream fris2=(reads2==null ? null : new FastqReadInputStream(reads2, false)); | |
93 ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, maxReads); | |
94 | |
95 cris.start(); | |
96 System.err.println("Started cris"); | |
97 boolean paired=cris.paired(); | |
98 System.err.println("Paired: "+paired); | |
99 | |
100 long kmer=0; //current kmer | |
101 int len=0; //distance since last contig start or ambiguous base | |
102 | |
103 { | |
104 ListNum<Read> ln=cris.nextList(); | |
105 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
106 | |
107 if(reads!=null && !reads.isEmpty()){ | |
108 Read r=reads.get(0); | |
109 assert(paired==(r.mate!=null)); | |
110 } | |
111 | |
112 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
113 //System.err.println("reads.size()="+reads.size()); | |
114 for(Read r : reads){ | |
115 readsProcessed++; | |
116 | |
117 len=0; | |
118 kmer=0; | |
119 byte[] bases=r.bases; | |
120 byte[] quals=r.quality; | |
121 | |
122 for(int i=0; i<bases.length; i++){ | |
123 byte b=bases[i]; | |
124 int x=AminoAcid.baseToNumber[b]; | |
125 int x2=buffer[len%buffer.length]; | |
126 buffer[len%buffer.length]=x; | |
127 if(x<0){ | |
128 len=0; | |
129 kmer=0; | |
130 }else{ | |
131 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); | |
132 len++; | |
133 if(len>=k){ | |
134 if(len>k){kmer=kmer^rotMasks[x2];} | |
135 long hashcode=kmer&0x7fffffffffffffffL; | |
136 long code1=hashcode%(cells-3); | |
137 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); | |
138 int value=count.increment2(code1, 1); | |
139 long temp=count.read(code2); | |
140 if(temp>0){ | |
141 if(value==0){collisionsA++;} | |
142 else{collisionsB++;} | |
143 } | |
144 } | |
145 } | |
146 } | |
147 | |
148 | |
149 if(r.mate!=null){ | |
150 len=0; | |
151 kmer=0; | |
152 bases=r.mate.bases; | |
153 quals=r.mate.quality; | |
154 for(int i=0; i<bases.length; i++){ | |
155 byte b=bases[i]; | |
156 int x=AminoAcid.baseToNumber[b]; | |
157 int x2=buffer[len%buffer.length]; | |
158 buffer[len%buffer.length]=x; | |
159 if(x<0){ | |
160 len=0; | |
161 kmer=0; | |
162 }else{ | |
163 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x); | |
164 len++; | |
165 if(len>=k){ | |
166 if(len>k){kmer=kmer^rotMasks[x2];} | |
167 long hashcode=kmer&0x7fffffffffffffffL; | |
168 long code1=hashcode%(cells-3); | |
169 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); | |
170 int value=count.increment2(code1, 1); | |
171 long temp=count.read(code2); | |
172 if(temp>0){ | |
173 if(value==0){collisionsA++;} | |
174 else{collisionsB++;} | |
175 } | |
176 } | |
177 } | |
178 } | |
179 } | |
180 | |
181 } | |
182 //System.err.println("returning list"); | |
183 cris.returnList(ln); | |
184 //System.err.println("fetching list"); | |
185 ln=cris.nextList(); | |
186 reads=(ln!=null ? ln.list : null); | |
187 } | |
188 System.err.println("Finished reading"); | |
189 cris.returnList(ln); | |
190 System.err.println("Returned list"); | |
191 ReadWrite.closeStream(cris); | |
192 System.err.println("Closed stream"); | |
193 System.err.println("Processed "+readsProcessed+" reads."); | |
194 } | |
195 | |
196 return count; | |
197 } | |
198 | |
199 public static final long[] makeRotMasks(int rotDist){ | |
200 long[] masks=new long[4]; | |
201 for(long i=0; i<4; i++){ | |
202 masks[(int)i]=Long.rotateLeft(i, rotDist); | |
203 } | |
204 return masks; | |
205 } | |
206 | |
207 public static long[] transformToFrequency(int[] count){ | |
208 long[] freq=new long[2000]; | |
209 int max=freq.length-1; | |
210 for(int i=0; i<count.length; i++){ | |
211 int x=count[i]; | |
212 x=min(x, max); | |
213 freq[x]++; | |
214 } | |
215 return freq; | |
216 } | |
217 | |
218 public static long sum(int[] array){ | |
219 long x=0; | |
220 for(int y : array){x+=y;} | |
221 return x; | |
222 } | |
223 | |
224 public static long sum(long[] array){ | |
225 long x=0; | |
226 for(long y : array){x+=y;} | |
227 return x; | |
228 } | |
229 | |
230 public static final int min(int x, int y){return x<y ? x : y;} | |
231 public static final int max(int x, int y){return x>y ? x : y;} | |
232 | |
233 public static boolean verbose=true; | |
234 public static byte minQuality=-5; | |
235 public static long readsProcessed=0; | |
236 public static long maxReads=1000000L; | |
237 public static final int ROTATE_DIST=2; | |
238 | |
239 public static long collisionsA=0; | |
240 public static long collisionsB=0; | |
241 | |
242 } |