comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Splitter.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package clump;
2
3 import java.util.ArrayList;
4 import java.util.LinkedHashMap;
5 import java.util.Map.Entry;
6
7 import dna.AminoAcid;
8 import shared.KillSwitch;
9 import shared.Tools;
10 import stream.Read;
11 import structures.IntList;
12 import structures.LongList;
13
14 /**
15 * A tool for splitting clumps by allele.
16 * @author Brian Bushnell
17 * @date September 26, 2016
18 *
19 */
20 class Splitter {
21
22 static ArrayList<Clump> splitOnPivot(Clump c){
23 ArrayList<Clump> list=new ArrayList<Clump>(3);
24 list.add(c);
25 if(c.size()<minSizeSplit){
26 // assert(findBestPivot(c)<0);
27 return list;
28 }
29 return splitOnPivot(list);
30 }
31
32 static ArrayList<Clump> splitOnPivot(ArrayList<Clump> list){
33 ArrayList<Clump> out=new ArrayList<Clump>();
34
35 final IntList pivots=new IntList(2);
36 for(int i=0; i<list.size(); i++){
37 Clump clump=list.get(i);
38 list.set(i, null);
39 int pivot=findBestPivots(clump, FIND_CORRELATIONS, pivots);
40 if(pivot<0){
41 assert(pivots.size==0);
42 out.add(clump);
43 }else{
44 assert(pivots.size==1 || pivots.size==2) : pivot+", "+pivots.size+", "+pivots;
45 assert(pivots.get(0)==pivot);
46 splitAndAdd(clump, pivots.get(0), (pivots.size>1 ? pivots.get(1) : -1), list);
47 }
48 }
49 return out;
50 }
51
52 static void splitAndAdd(Clump c, final int var1, final int var2, ArrayList<Clump> out) {
53 final int maxLeft=c.maxLeft();
54
55 final Clump major=Clump.makeClump(c.kmer), minor=Clump.makeClump(c.kmer);
56
57 for(Read r : c){
58 if(containsVar(var1, r, maxLeft) || (var2>=0 && containsVar(var2, r, maxLeft))){
59 minor.add(r);
60 }else{
61 major.add(r);
62 }
63 }
64 // System.err.println(major.size()+"\t"+minor.size());
65 out.add(major);
66 out.add(minor);
67 }
68
69 //Returns the c
70 static int countVariants(Clump c, LongList varCounts){
71 varCounts.clear();
72 final int[][] bcounts=c.baseCounts();
73 final int[][] qcounts=c.qualityCounts();
74 final int len=bcounts[0].length;
75 for(int i=0; i<len; i++){
76 final int major=c.getConsensusAtPosition(qcounts, i);
77 for(int x=0; x<4; x++){
78 final int bcount=bcounts[x][i];
79 if(bcount>1 && x!=major){
80 final long var=(((long)bcount)<<32)|((i<<shift)|x);
81 varCounts.add(var);
82 }
83 }
84 }
85 if(varCounts.size()<1){return 0;}
86 varCounts.sort();
87 varCounts.reverse();
88 return (int)(varCounts.get(0)>>>32);
89 }
90
91 static LinkedHashMap<Integer, ArrayList<Read>> findReadVariants(Clump c, boolean makeMap){
92 if(c.size()<5){return null;} //Not really needed with tiny clumps
93 // assert(c.size()>3); //TODO
94 LinkedHashMap<Integer, ArrayList<Read>> map=null;
95 map=(makeMap ? new LinkedHashMap<Integer, ArrayList<Read>>() : null);
96 // if(makeMap){
97 // map=localMap.get();
98 // if(map==null){
99 // map=new LinkedHashMap<Integer, ArrayList<Read>>();
100 // localMap.set(map);
101 // }
102 // map.clear();
103 // }
104
105 final int[][] bcounts=c.baseCounts();
106 final Read ref=c.consensusRead();
107 final byte[] rbases=ref.bases;
108 for(Read r : c){
109 final byte[] bases=r.bases;
110 final ReadKey key=(ReadKey) r.obj;
111 IntList list=key.vars;
112 if(list!=null){list.clear();}
113
114
115 final int cStart=0, rStart=c.maxLeft()-key.position;
116
117 for(int i=cStart, j=rStart; i<bases.length; i++, j++){
118 final byte cb=bases[i], rb=rbases[j];
119 if(cb!=rb){
120 byte x=AminoAcid.baseToNumber[cb];
121 if(x>=0){
122 int count=bcounts[x][j];
123 if(count>1){
124 int var=(j<<2)|x;
125 if(list==null){list=key.vars=new IntList(4);}
126 list.add(var);
127
128 if(map!=null){
129 Integer mapkey=var;//mapKeys[var];
130 ArrayList<Read> alr=map.get(mapkey);
131 if(alr==null){
132 alr=new ArrayList<Read>(4);
133 map.put(mapkey, alr);
134 }
135 alr.add(r);
136 }
137
138 }
139 }
140 }
141 }
142 }
143 return map==null || map.isEmpty() ? null : map;
144 }
145
146 static int findBestPivot_Correlated(Clump c, IntList pivots){
147 assert(pivots.size==0);
148 LinkedHashMap<Integer, ArrayList<Read>> map=findReadVariants(c, true);
149 if(map==null){return -1;}
150
151 IntList collection=new IntList(32);
152 int[] rvector=KillSwitch.allocInt1D(5);
153
154 int bestVar=-1;
155 int bestVarCount=-1;
156 int bestVar2=-1;
157 int bestVar2Count=-1;
158 int bestDifferent=-1;
159 float bestCorrelation=-1;
160 float bestScore=-1;
161
162 final float minCorrelation=0.75f;
163 final int minVar2Count=2;
164
165 int max=0;
166 for(Entry<Integer, ArrayList<Read>> entry : map.entrySet()){
167 max=Tools.max(max, entry.getValue().size());
168 }
169 if(max<2){return -1;}
170 final int thresh=Tools.max(2, max/2);
171 final int thresh2=max/4;
172 int numOverThresh=0;
173 ArrayList<Integer> remove=new ArrayList<Integer>();
174 for(Entry<Integer, ArrayList<Read>> entry : map.entrySet()){
175 int x=entry.getValue().size();
176 if(x>=thresh){numOverThresh++;}
177 else if(x<thresh2){remove.add(entry.getKey());}
178 }
179 for(Integer key : remove){map.remove(key);}
180 if(numOverThresh>MAX_CORRELATIONS){return -1;}
181
182 for(Entry<Integer, ArrayList<Read>> entry : map.entrySet()){
183 final ArrayList<Read> rlist=entry.getValue();
184 final Integer key=entry.getKey();
185 final int var=key;
186 if(rlist.size()>=thresh){
187
188 final int var2=examineVar(var, rlist, collection, rvector, map);
189
190 if(var2>=0){
191
192 final int varCount=rvector[1];
193 final int var2Count=rvector[3];
194 final int different=rvector[4];
195
196 final int var2reads;
197 final int var2ReadsWithoutVar;
198 {
199 ArrayList<Read> temp=map.get(var2);
200 var2reads=(temp==null ? 0 : temp.size());
201
202 // if(var2reads==var2Count){
203 // var2ReadsWithoutVar=0;
204 // }else{
205 // var2ReadsWithoutVar=countDifferentAlleles(var, temp);
206 // }
207
208 var2ReadsWithoutVar=var2reads-varCount;
209
210 }
211
212 // final float correlation=(var2Count-0.05f)/(float)varCount;
213 // final float correlation=(varCount-different)/(float)varCount;
214 final float correlation=(Tools.max(varCount, var2reads)-different)/(float)Tools.max(varCount, var2reads);
215 final int distance=Tools.absdif(var>>2, var2>>2);
216
217 // final float score=correlation*((var2Count-1)-0.5f*var2ReadsWithoutVar-different+1.0f*(varCount));
218
219 final float score=correlation*(var2reads/*var2Count*/+varCount-different+0.5f*var2ReadsWithoutVar)*(distance+250);
220
221 // final float score=correlation*((var2Count-1)-1.0f*var2ReadsWithoutVar+1.0f*(varCount));
222 if(correlation>=minCorrelation && var2Count>=minVar2Count){
223 if(score>bestScore || (score==bestScore && varCount>bestVarCount)){
224 bestVar=var;
225 bestVarCount=varCount;
226 bestVar2=var2;
227 bestVar2Count=var2Count;
228 bestCorrelation=correlation;
229 bestScore=score;
230 bestDifferent=different;
231 }
232 }
233 }
234 }
235 }
236
237 if(bestVar2Count>=minVar2Count && bestCorrelation>=minCorrelation){
238 pivots.add(bestVar);
239 pivots.add(bestVar2);
240 return bestVar;
241 }
242 return -1;
243 }
244
245 static boolean containsVar(final int var, final Read r, final int maxLeft){
246 final int varPos=var>>2;
247 final int varAllele=var&alleleMask;
248 final ReadKey rk=(ReadKey) r.obj;
249 final int rloc=toReadLocation(varPos, maxLeft, rk.position);
250 if(rloc<0 || rloc>=r.length()){
251 return false;
252 }
253 final int readAllele=AminoAcid.baseToNumber[r.bases[rloc]];
254 return readAllele==varAllele;
255 }
256
257 static boolean hasDifferentAllele(final int var, final Read r/*, final Clump c*/){
258 final int varPos=var>>2;
259 final int varAllele=var&alleleMask;
260 final ReadKey rk=(ReadKey) r.obj;
261 final IntList vars=rk.vars;
262 final Clump c=rk.clump;
263 assert(c==rk.clump);
264 final int maxLeft=c.maxLeft();
265 final int rloc=toReadLocation(varPos, maxLeft, rk.position);
266 if(rloc<0 || rloc>=r.length()){
267 assert(!vars.contains(var));
268 return false;
269 }
270 final int readAllele=AminoAcid.baseToNumber[r.bases[rloc]];
271 assert((readAllele==varAllele)==vars.contains(var));
272
273 return readAllele!=varAllele;
274 }
275
276 static int countDifferentAlleles(final int var, ArrayList<Read> list){
277 if(list==null){return 0;}
278 int sum=0;
279 for(Read r : list){
280 if(hasDifferentAllele(var, r)){sum++;}
281 }
282 return sum;
283 }
284
285 static int examineVar(final int var, final ArrayList<Read> list, final IntList collection, final int[] rvector, LinkedHashMap<Integer, ArrayList<Read>> map){
286 collection.clear();
287
288 for(Read r : list){
289 final ReadKey rk=(ReadKey) r.obj;
290 final IntList vars=rk.vars;
291
292 for(int i=0; i<vars.size; i++){
293 final int v2=vars.get(i);
294 if(v2!=var){
295 collection.add(v2);
296 }
297 }
298 }
299 collection.sort();
300
301 final int varCount=list.size();
302
303 int lastVar2=-1, bestVar2=-1;
304 int sharedCount=0, bestSharedCount=0, bestDifferent=999;
305 for(int i=0; i<collection.size; i++){//TODO: Note that not all reads actually cover a given var
306 int currentVar2=collection.get(i);
307 if(currentVar2==lastVar2){sharedCount++;}
308 else{
309 if(sharedCount>bestSharedCount){
310 final int different1=(sharedCount==varCount ? 0 : countDifferentAlleles(lastVar2, list));
311 if(different1*8<varCount){
312 ArrayList<Read> list2=map.get(lastVar2);
313 final int varCount2=(list2==null ? 0 : list2.size());
314 final int different2=(sharedCount==varCount2 ? 0 : countDifferentAlleles(var, list2));
315 if(different2*8<varCount2){
316 bestVar2=lastVar2;
317 bestSharedCount=sharedCount;
318 bestDifferent=Tools.max(different1, different2);
319 }
320 }
321 }
322 sharedCount=1;
323 }
324 lastVar2=currentVar2;
325 }
326 if(sharedCount>bestSharedCount){
327 final int different1=(sharedCount==varCount ? 0 : countDifferentAlleles(lastVar2, list));
328 if(different1*8<varCount){
329 ArrayList<Read> list2=map.get(lastVar2);
330 final int varCount2=(list2==null ? 0 : list2.size());
331 final int different2=(sharedCount==varCount2 ? 0 : countDifferentAlleles(var, list2));
332 if(different2*8<varCount2){
333 bestVar2=lastVar2;
334 bestSharedCount=sharedCount;
335 bestDifferent=Tools.max(different1, different2);
336 }
337 }
338 }
339 rvector[0]=var;
340 rvector[1]=list.size();
341 rvector[2]=bestVar2;
342 rvector[3]=sharedCount;
343 rvector[4]=bestDifferent;
344
345 return bestVar2;
346 }
347
348 static final int toReadLocation(final int clumpLocation, final int maxLeft, final int readPos){
349 final int readLocation=clumpLocation+readPos-maxLeft;
350 return readLocation;
351 }
352
353 static final int toClumpLocation(final int readLocation, final int maxLeft, final int readPos){
354 final int clumpLocation=readLocation-readPos+maxLeft;
355 assert(readLocation==toReadLocation(clumpLocation, maxLeft, readPos));
356 return clumpLocation;
357 }
358
359 static int findBestPivots(Clump c, boolean findCorrelations, IntList pivots){
360 pivots.clear();
361 final int size=c.size();
362 if(size<minSizeSplit){return -1;}
363
364 if(findCorrelations){
365 int x=findBestPivot_Correlated(c, pivots);
366 if(x>-1){return x;}
367 }
368
369 final int[][] bcounts=c.baseCounts();
370 final int[][] qcounts=c.qualityCounts();
371 final float[][] qAvgs=c.qualityAverages();
372 final int width=c.width();
373
374 int bestPosition=-1;
375 int bestVar=-1;
376 long bestBsecond=0;
377 long bestQsecond=0;
378
379 // final float bmult=8f, bmult2=15f, qmult=(c.useQuality() ? 1.5f : 100f);
380 final boolean useQuality=c.useQuality();
381 final float bmult=20f, bmult2=20f;
382 final float qmult=20f, qamult=1.5f;
383
384 final int minPivotDepth=Tools.max(4, (int)(minSizeFractionSplit*size));
385 // final int minMinorAllele=Tools.max((conservative ? 1 : 2), (int)(0.25f+size/bmult2));
386 final int minMinorAllele=Tools.max(2, (int)(size/bmult2));
387 final int minMinorAlleleQ=minMinorAllele*10;
388
389 // assert(false) : size+", "+minSizeSplit+", "+minPivotDepth+", "+minMinorAllele;
390
391 for(int i=0; i<width; i++){
392 final long sum=c.getSumAtPosition(bcounts, i);
393 if(sum>=minPivotDepth){
394 final int pmajor=c.getConsensusAtPosition(bcounts, i);
395 final int pminor=c.getSecondHighest(bcounts, i);
396 if(pmajor>=0){
397 final long bmajor=bcounts[pmajor][i];
398 final long bsecond=bcounts[pminor][i];
399
400 final long qmajor=qcounts[pmajor][i];
401 final long qsecond=qcounts[pminor][i];
402
403 final float qamajor=qAvgs[pmajor][i];
404 final float qasecond=qAvgs[pminor][i];
405
406 if(bsecond*bmult>=bmajor && bsecond>=minMinorAllele){
407 if(!useQuality || (qsecond*qmult>=qmajor && qasecond*qamult>=qamajor && qsecond>=minMinorAlleleQ)){//candidate
408 if(bsecond>bestBsecond || (bsecond==bestBsecond && qsecond>bestQsecond)){//Perhaps Qsecond should be more important...?
409
410 // assert(false) : size+", "+minSizeSplit+", "+minPivotDepth+", "+minMinorAllele+", "+bmajor+", "+bsecond+", "+qmajor+", "+qsecond+", "+minMinorAlleleQ;
411
412 bestBsecond=bsecond;
413 bestQsecond=qsecond;
414 bestPosition=i;
415 bestVar=(bestPosition<<2)|pminor;
416 }
417 }
418 }
419 }
420 }
421 }
422
423 // if(bestVar<0 && findCorrelations){
424 // bestVar=findBestPivot_Correlated(c);
425 // }
426
427 if(bestVar>=0){pivots.add(bestVar);}
428 return bestVar;
429 }
430
431 static int minSizeSplit=4; //5 is actually better than 4 in allele separation tests...?
432 static float minSizeFractionSplit=0.17f; //0.2 is substantially worse, 0.14 is a tiny bit better than 0.17
433 static boolean conservative=false;
434
435 private static final int alleleMask=0x3;
436 private static final int posMask=~alleleMask;
437 private static final int shift=2;
438
439 static boolean FIND_CORRELATIONS=true;
440 static int MAX_CORRELATIONS=12;
441
442 // private static final ThreadLocal<LinkedHashMap<Integer, ArrayList<Read>>> localMap=new ThreadLocal<LinkedHashMap<Integer, ArrayList<Read>>>();
443 // private static final Integer[] mapKeys=new Integer[2000];
444 // static{
445 // for(int i=0; i<mapKeys.length; i++){mapKeys[i]=i;}
446 // }
447
448 }