Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Splitter.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package clump; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.LinkedHashMap; | |
5 import java.util.Map.Entry; | |
6 | |
7 import dna.AminoAcid; | |
8 import shared.KillSwitch; | |
9 import shared.Tools; | |
10 import stream.Read; | |
11 import structures.IntList; | |
12 import structures.LongList; | |
13 | |
14 /** | |
15 * A tool for splitting clumps by allele. | |
16 * @author Brian Bushnell | |
17 * @date September 26, 2016 | |
18 * | |
19 */ | |
20 class Splitter { | |
21 | |
22 static ArrayList<Clump> splitOnPivot(Clump c){ | |
23 ArrayList<Clump> list=new ArrayList<Clump>(3); | |
24 list.add(c); | |
25 if(c.size()<minSizeSplit){ | |
26 // assert(findBestPivot(c)<0); | |
27 return list; | |
28 } | |
29 return splitOnPivot(list); | |
30 } | |
31 | |
32 static ArrayList<Clump> splitOnPivot(ArrayList<Clump> list){ | |
33 ArrayList<Clump> out=new ArrayList<Clump>(); | |
34 | |
35 final IntList pivots=new IntList(2); | |
36 for(int i=0; i<list.size(); i++){ | |
37 Clump clump=list.get(i); | |
38 list.set(i, null); | |
39 int pivot=findBestPivots(clump, FIND_CORRELATIONS, pivots); | |
40 if(pivot<0){ | |
41 assert(pivots.size==0); | |
42 out.add(clump); | |
43 }else{ | |
44 assert(pivots.size==1 || pivots.size==2) : pivot+", "+pivots.size+", "+pivots; | |
45 assert(pivots.get(0)==pivot); | |
46 splitAndAdd(clump, pivots.get(0), (pivots.size>1 ? pivots.get(1) : -1), list); | |
47 } | |
48 } | |
49 return out; | |
50 } | |
51 | |
52 static void splitAndAdd(Clump c, final int var1, final int var2, ArrayList<Clump> out) { | |
53 final int maxLeft=c.maxLeft(); | |
54 | |
55 final Clump major=Clump.makeClump(c.kmer), minor=Clump.makeClump(c.kmer); | |
56 | |
57 for(Read r : c){ | |
58 if(containsVar(var1, r, maxLeft) || (var2>=0 && containsVar(var2, r, maxLeft))){ | |
59 minor.add(r); | |
60 }else{ | |
61 major.add(r); | |
62 } | |
63 } | |
64 // System.err.println(major.size()+"\t"+minor.size()); | |
65 out.add(major); | |
66 out.add(minor); | |
67 } | |
68 | |
69 //Returns the c | |
70 static int countVariants(Clump c, LongList varCounts){ | |
71 varCounts.clear(); | |
72 final int[][] bcounts=c.baseCounts(); | |
73 final int[][] qcounts=c.qualityCounts(); | |
74 final int len=bcounts[0].length; | |
75 for(int i=0; i<len; i++){ | |
76 final int major=c.getConsensusAtPosition(qcounts, i); | |
77 for(int x=0; x<4; x++){ | |
78 final int bcount=bcounts[x][i]; | |
79 if(bcount>1 && x!=major){ | |
80 final long var=(((long)bcount)<<32)|((i<<shift)|x); | |
81 varCounts.add(var); | |
82 } | |
83 } | |
84 } | |
85 if(varCounts.size()<1){return 0;} | |
86 varCounts.sort(); | |
87 varCounts.reverse(); | |
88 return (int)(varCounts.get(0)>>>32); | |
89 } | |
90 | |
91 static LinkedHashMap<Integer, ArrayList<Read>> findReadVariants(Clump c, boolean makeMap){ | |
92 if(c.size()<5){return null;} //Not really needed with tiny clumps | |
93 // assert(c.size()>3); //TODO | |
94 LinkedHashMap<Integer, ArrayList<Read>> map=null; | |
95 map=(makeMap ? new LinkedHashMap<Integer, ArrayList<Read>>() : null); | |
96 // if(makeMap){ | |
97 // map=localMap.get(); | |
98 // if(map==null){ | |
99 // map=new LinkedHashMap<Integer, ArrayList<Read>>(); | |
100 // localMap.set(map); | |
101 // } | |
102 // map.clear(); | |
103 // } | |
104 | |
105 final int[][] bcounts=c.baseCounts(); | |
106 final Read ref=c.consensusRead(); | |
107 final byte[] rbases=ref.bases; | |
108 for(Read r : c){ | |
109 final byte[] bases=r.bases; | |
110 final ReadKey key=(ReadKey) r.obj; | |
111 IntList list=key.vars; | |
112 if(list!=null){list.clear();} | |
113 | |
114 | |
115 final int cStart=0, rStart=c.maxLeft()-key.position; | |
116 | |
117 for(int i=cStart, j=rStart; i<bases.length; i++, j++){ | |
118 final byte cb=bases[i], rb=rbases[j]; | |
119 if(cb!=rb){ | |
120 byte x=AminoAcid.baseToNumber[cb]; | |
121 if(x>=0){ | |
122 int count=bcounts[x][j]; | |
123 if(count>1){ | |
124 int var=(j<<2)|x; | |
125 if(list==null){list=key.vars=new IntList(4);} | |
126 list.add(var); | |
127 | |
128 if(map!=null){ | |
129 Integer mapkey=var;//mapKeys[var]; | |
130 ArrayList<Read> alr=map.get(mapkey); | |
131 if(alr==null){ | |
132 alr=new ArrayList<Read>(4); | |
133 map.put(mapkey, alr); | |
134 } | |
135 alr.add(r); | |
136 } | |
137 | |
138 } | |
139 } | |
140 } | |
141 } | |
142 } | |
143 return map==null || map.isEmpty() ? null : map; | |
144 } | |
145 | |
146 static int findBestPivot_Correlated(Clump c, IntList pivots){ | |
147 assert(pivots.size==0); | |
148 LinkedHashMap<Integer, ArrayList<Read>> map=findReadVariants(c, true); | |
149 if(map==null){return -1;} | |
150 | |
151 IntList collection=new IntList(32); | |
152 int[] rvector=KillSwitch.allocInt1D(5); | |
153 | |
154 int bestVar=-1; | |
155 int bestVarCount=-1; | |
156 int bestVar2=-1; | |
157 int bestVar2Count=-1; | |
158 int bestDifferent=-1; | |
159 float bestCorrelation=-1; | |
160 float bestScore=-1; | |
161 | |
162 final float minCorrelation=0.75f; | |
163 final int minVar2Count=2; | |
164 | |
165 int max=0; | |
166 for(Entry<Integer, ArrayList<Read>> entry : map.entrySet()){ | |
167 max=Tools.max(max, entry.getValue().size()); | |
168 } | |
169 if(max<2){return -1;} | |
170 final int thresh=Tools.max(2, max/2); | |
171 final int thresh2=max/4; | |
172 int numOverThresh=0; | |
173 ArrayList<Integer> remove=new ArrayList<Integer>(); | |
174 for(Entry<Integer, ArrayList<Read>> entry : map.entrySet()){ | |
175 int x=entry.getValue().size(); | |
176 if(x>=thresh){numOverThresh++;} | |
177 else if(x<thresh2){remove.add(entry.getKey());} | |
178 } | |
179 for(Integer key : remove){map.remove(key);} | |
180 if(numOverThresh>MAX_CORRELATIONS){return -1;} | |
181 | |
182 for(Entry<Integer, ArrayList<Read>> entry : map.entrySet()){ | |
183 final ArrayList<Read> rlist=entry.getValue(); | |
184 final Integer key=entry.getKey(); | |
185 final int var=key; | |
186 if(rlist.size()>=thresh){ | |
187 | |
188 final int var2=examineVar(var, rlist, collection, rvector, map); | |
189 | |
190 if(var2>=0){ | |
191 | |
192 final int varCount=rvector[1]; | |
193 final int var2Count=rvector[3]; | |
194 final int different=rvector[4]; | |
195 | |
196 final int var2reads; | |
197 final int var2ReadsWithoutVar; | |
198 { | |
199 ArrayList<Read> temp=map.get(var2); | |
200 var2reads=(temp==null ? 0 : temp.size()); | |
201 | |
202 // if(var2reads==var2Count){ | |
203 // var2ReadsWithoutVar=0; | |
204 // }else{ | |
205 // var2ReadsWithoutVar=countDifferentAlleles(var, temp); | |
206 // } | |
207 | |
208 var2ReadsWithoutVar=var2reads-varCount; | |
209 | |
210 } | |
211 | |
212 // final float correlation=(var2Count-0.05f)/(float)varCount; | |
213 // final float correlation=(varCount-different)/(float)varCount; | |
214 final float correlation=(Tools.max(varCount, var2reads)-different)/(float)Tools.max(varCount, var2reads); | |
215 final int distance=Tools.absdif(var>>2, var2>>2); | |
216 | |
217 // final float score=correlation*((var2Count-1)-0.5f*var2ReadsWithoutVar-different+1.0f*(varCount)); | |
218 | |
219 final float score=correlation*(var2reads/*var2Count*/+varCount-different+0.5f*var2ReadsWithoutVar)*(distance+250); | |
220 | |
221 // final float score=correlation*((var2Count-1)-1.0f*var2ReadsWithoutVar+1.0f*(varCount)); | |
222 if(correlation>=minCorrelation && var2Count>=minVar2Count){ | |
223 if(score>bestScore || (score==bestScore && varCount>bestVarCount)){ | |
224 bestVar=var; | |
225 bestVarCount=varCount; | |
226 bestVar2=var2; | |
227 bestVar2Count=var2Count; | |
228 bestCorrelation=correlation; | |
229 bestScore=score; | |
230 bestDifferent=different; | |
231 } | |
232 } | |
233 } | |
234 } | |
235 } | |
236 | |
237 if(bestVar2Count>=minVar2Count && bestCorrelation>=minCorrelation){ | |
238 pivots.add(bestVar); | |
239 pivots.add(bestVar2); | |
240 return bestVar; | |
241 } | |
242 return -1; | |
243 } | |
244 | |
245 static boolean containsVar(final int var, final Read r, final int maxLeft){ | |
246 final int varPos=var>>2; | |
247 final int varAllele=var&alleleMask; | |
248 final ReadKey rk=(ReadKey) r.obj; | |
249 final int rloc=toReadLocation(varPos, maxLeft, rk.position); | |
250 if(rloc<0 || rloc>=r.length()){ | |
251 return false; | |
252 } | |
253 final int readAllele=AminoAcid.baseToNumber[r.bases[rloc]]; | |
254 return readAllele==varAllele; | |
255 } | |
256 | |
257 static boolean hasDifferentAllele(final int var, final Read r/*, final Clump c*/){ | |
258 final int varPos=var>>2; | |
259 final int varAllele=var&alleleMask; | |
260 final ReadKey rk=(ReadKey) r.obj; | |
261 final IntList vars=rk.vars; | |
262 final Clump c=rk.clump; | |
263 assert(c==rk.clump); | |
264 final int maxLeft=c.maxLeft(); | |
265 final int rloc=toReadLocation(varPos, maxLeft, rk.position); | |
266 if(rloc<0 || rloc>=r.length()){ | |
267 assert(!vars.contains(var)); | |
268 return false; | |
269 } | |
270 final int readAllele=AminoAcid.baseToNumber[r.bases[rloc]]; | |
271 assert((readAllele==varAllele)==vars.contains(var)); | |
272 | |
273 return readAllele!=varAllele; | |
274 } | |
275 | |
276 static int countDifferentAlleles(final int var, ArrayList<Read> list){ | |
277 if(list==null){return 0;} | |
278 int sum=0; | |
279 for(Read r : list){ | |
280 if(hasDifferentAllele(var, r)){sum++;} | |
281 } | |
282 return sum; | |
283 } | |
284 | |
285 static int examineVar(final int var, final ArrayList<Read> list, final IntList collection, final int[] rvector, LinkedHashMap<Integer, ArrayList<Read>> map){ | |
286 collection.clear(); | |
287 | |
288 for(Read r : list){ | |
289 final ReadKey rk=(ReadKey) r.obj; | |
290 final IntList vars=rk.vars; | |
291 | |
292 for(int i=0; i<vars.size; i++){ | |
293 final int v2=vars.get(i); | |
294 if(v2!=var){ | |
295 collection.add(v2); | |
296 } | |
297 } | |
298 } | |
299 collection.sort(); | |
300 | |
301 final int varCount=list.size(); | |
302 | |
303 int lastVar2=-1, bestVar2=-1; | |
304 int sharedCount=0, bestSharedCount=0, bestDifferent=999; | |
305 for(int i=0; i<collection.size; i++){//TODO: Note that not all reads actually cover a given var | |
306 int currentVar2=collection.get(i); | |
307 if(currentVar2==lastVar2){sharedCount++;} | |
308 else{ | |
309 if(sharedCount>bestSharedCount){ | |
310 final int different1=(sharedCount==varCount ? 0 : countDifferentAlleles(lastVar2, list)); | |
311 if(different1*8<varCount){ | |
312 ArrayList<Read> list2=map.get(lastVar2); | |
313 final int varCount2=(list2==null ? 0 : list2.size()); | |
314 final int different2=(sharedCount==varCount2 ? 0 : countDifferentAlleles(var, list2)); | |
315 if(different2*8<varCount2){ | |
316 bestVar2=lastVar2; | |
317 bestSharedCount=sharedCount; | |
318 bestDifferent=Tools.max(different1, different2); | |
319 } | |
320 } | |
321 } | |
322 sharedCount=1; | |
323 } | |
324 lastVar2=currentVar2; | |
325 } | |
326 if(sharedCount>bestSharedCount){ | |
327 final int different1=(sharedCount==varCount ? 0 : countDifferentAlleles(lastVar2, list)); | |
328 if(different1*8<varCount){ | |
329 ArrayList<Read> list2=map.get(lastVar2); | |
330 final int varCount2=(list2==null ? 0 : list2.size()); | |
331 final int different2=(sharedCount==varCount2 ? 0 : countDifferentAlleles(var, list2)); | |
332 if(different2*8<varCount2){ | |
333 bestVar2=lastVar2; | |
334 bestSharedCount=sharedCount; | |
335 bestDifferent=Tools.max(different1, different2); | |
336 } | |
337 } | |
338 } | |
339 rvector[0]=var; | |
340 rvector[1]=list.size(); | |
341 rvector[2]=bestVar2; | |
342 rvector[3]=sharedCount; | |
343 rvector[4]=bestDifferent; | |
344 | |
345 return bestVar2; | |
346 } | |
347 | |
348 static final int toReadLocation(final int clumpLocation, final int maxLeft, final int readPos){ | |
349 final int readLocation=clumpLocation+readPos-maxLeft; | |
350 return readLocation; | |
351 } | |
352 | |
353 static final int toClumpLocation(final int readLocation, final int maxLeft, final int readPos){ | |
354 final int clumpLocation=readLocation-readPos+maxLeft; | |
355 assert(readLocation==toReadLocation(clumpLocation, maxLeft, readPos)); | |
356 return clumpLocation; | |
357 } | |
358 | |
359 static int findBestPivots(Clump c, boolean findCorrelations, IntList pivots){ | |
360 pivots.clear(); | |
361 final int size=c.size(); | |
362 if(size<minSizeSplit){return -1;} | |
363 | |
364 if(findCorrelations){ | |
365 int x=findBestPivot_Correlated(c, pivots); | |
366 if(x>-1){return x;} | |
367 } | |
368 | |
369 final int[][] bcounts=c.baseCounts(); | |
370 final int[][] qcounts=c.qualityCounts(); | |
371 final float[][] qAvgs=c.qualityAverages(); | |
372 final int width=c.width(); | |
373 | |
374 int bestPosition=-1; | |
375 int bestVar=-1; | |
376 long bestBsecond=0; | |
377 long bestQsecond=0; | |
378 | |
379 // final float bmult=8f, bmult2=15f, qmult=(c.useQuality() ? 1.5f : 100f); | |
380 final boolean useQuality=c.useQuality(); | |
381 final float bmult=20f, bmult2=20f; | |
382 final float qmult=20f, qamult=1.5f; | |
383 | |
384 final int minPivotDepth=Tools.max(4, (int)(minSizeFractionSplit*size)); | |
385 // final int minMinorAllele=Tools.max((conservative ? 1 : 2), (int)(0.25f+size/bmult2)); | |
386 final int minMinorAllele=Tools.max(2, (int)(size/bmult2)); | |
387 final int minMinorAlleleQ=minMinorAllele*10; | |
388 | |
389 // assert(false) : size+", "+minSizeSplit+", "+minPivotDepth+", "+minMinorAllele; | |
390 | |
391 for(int i=0; i<width; i++){ | |
392 final long sum=c.getSumAtPosition(bcounts, i); | |
393 if(sum>=minPivotDepth){ | |
394 final int pmajor=c.getConsensusAtPosition(bcounts, i); | |
395 final int pminor=c.getSecondHighest(bcounts, i); | |
396 if(pmajor>=0){ | |
397 final long bmajor=bcounts[pmajor][i]; | |
398 final long bsecond=bcounts[pminor][i]; | |
399 | |
400 final long qmajor=qcounts[pmajor][i]; | |
401 final long qsecond=qcounts[pminor][i]; | |
402 | |
403 final float qamajor=qAvgs[pmajor][i]; | |
404 final float qasecond=qAvgs[pminor][i]; | |
405 | |
406 if(bsecond*bmult>=bmajor && bsecond>=minMinorAllele){ | |
407 if(!useQuality || (qsecond*qmult>=qmajor && qasecond*qamult>=qamajor && qsecond>=minMinorAlleleQ)){//candidate | |
408 if(bsecond>bestBsecond || (bsecond==bestBsecond && qsecond>bestQsecond)){//Perhaps Qsecond should be more important...? | |
409 | |
410 // assert(false) : size+", "+minSizeSplit+", "+minPivotDepth+", "+minMinorAllele+", "+bmajor+", "+bsecond+", "+qmajor+", "+qsecond+", "+minMinorAlleleQ; | |
411 | |
412 bestBsecond=bsecond; | |
413 bestQsecond=qsecond; | |
414 bestPosition=i; | |
415 bestVar=(bestPosition<<2)|pminor; | |
416 } | |
417 } | |
418 } | |
419 } | |
420 } | |
421 } | |
422 | |
423 // if(bestVar<0 && findCorrelations){ | |
424 // bestVar=findBestPivot_Correlated(c); | |
425 // } | |
426 | |
427 if(bestVar>=0){pivots.add(bestVar);} | |
428 return bestVar; | |
429 } | |
430 | |
431 static int minSizeSplit=4; //5 is actually better than 4 in allele separation tests...? | |
432 static float minSizeFractionSplit=0.17f; //0.2 is substantially worse, 0.14 is a tiny bit better than 0.17 | |
433 static boolean conservative=false; | |
434 | |
435 private static final int alleleMask=0x3; | |
436 private static final int posMask=~alleleMask; | |
437 private static final int shift=2; | |
438 | |
439 static boolean FIND_CORRELATIONS=true; | |
440 static int MAX_CORRELATIONS=12; | |
441 | |
442 // private static final ThreadLocal<LinkedHashMap<Integer, ArrayList<Read>>> localMap=new ThreadLocal<LinkedHashMap<Integer, ArrayList<Read>>>(); | |
443 // private static final Integer[] mapKeys=new Integer[2000]; | |
444 // static{ | |
445 // for(int i=0; i<mapKeys.length; i++){mapKeys[i]=i;} | |
446 // } | |
447 | |
448 } |