comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerPacBioAdapter.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package aligner;
2
3 import java.util.Arrays;
4
5 import shared.KillSwitch;
6 import shared.Tools;
7
8 /**
9 * Based on MSA9PBA, but reduced to a single matrix. */
10 public final class SingleStateAlignerPacBioAdapter {
11
12
13 public SingleStateAlignerPacBioAdapter(int maxRows_, int maxColumns_, int qlen){
14 // assert(maxColumns_>=200);
15 // assert(maxRows_>=200);
16 maxRows=maxRows_;
17 maxColumns=maxColumns_;
18 packed=new int[maxRows+1][maxColumns+1];
19
20 insScoreArray=KillSwitch.allocInt1D(5);
21 delScoreArray=KillSwitch.allocInt1D(5);
22 matchScoreArray=KillSwitch.allocInt1D(5);
23 subScoreArray=KillSwitch.allocInt1D(5);
24
25 Arrays.fill(insScoreArray, POINTSoff_INS|MODE_INS);
26 Arrays.fill(delScoreArray, POINTSoff_DEL|MODE_DEL);
27 Arrays.fill(matchScoreArray, POINTSoff_MATCH|MODE_MATCH);
28 Arrays.fill(subScoreArray, POINTSoff_SUB|MODE_SUB);
29
30 insScoreArray[MODE_INS]=POINTSoff_INS2|MODE_INS;
31 delScoreArray[MODE_DEL]=POINTSoff_DEL2|MODE_DEL;
32 matchScoreArray[MODE_MATCH]=POINTSoff_MATCH2|MODE_MATCH;
33 subScoreArray[MODE_SUB]=POINTSoff_SUB2|MODE_SUB;
34
35 vertLimit=new int[maxRows+1];
36 horizLimit=new int[maxColumns+1];
37 Arrays.fill(vertLimit, BADoff);
38 Arrays.fill(horizLimit, BADoff);
39
40 // for(int i=0; i<maxColumns+1; i++){
41 // scores[0][i]=0-i;
42 // }
43
44 for(int i=1; i<=maxRows; i++){
45 for(int j=1; j<packed[i].length; j++){
46 packed[i][j]|=BADoff;
47 }
48 }
49 for(int i=0; i<=maxRows; i++){
50
51 int prevScore=(i<2 ? 0 : packed[i-1][0]);
52 int score=(i<2 ? (i*POINTSoff_INS) : prevScore+POINTSoff_INS2);
53
54 packed[i][0]=score;
55 }
56 for(int i=0; i<packed[0].length; i++){
57 packed[0][i]=packed[0][i]|(i<<STARTOFFSET);
58 int x=packed[0].length-i;
59 int qbases=qlen-x;
60 if(qbases>0){
61 packed[0][i]|=calcDelScoreOffset(qbases); //Forces consumption of query
62 }
63 }
64 }
65
66
67 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
68 * Will not fill areas that cannot match minScore */
69 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
70 return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);
71 }
72
73
74 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
75 * Will not fill areas that cannot match minScore */
76 private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
77 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore);
78 }
79
80
81 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
82 * Does not require a min score (ie, same as old method) */
83 private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, final int minScore){
84 rows=read.length;
85 columns=refEndLoc-refStartLoc+1;
86
87 //temporary, for finding a bug
88 if(rows>maxRows || columns>maxColumns){
89 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
90 }
91
92 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
93 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
94
95 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
96 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
97
98 final int refOffset=refStartLoc-1;
99 for(int row=1; row<=rows; row++){
100
101 final byte qBase=read[row-1];
102 for(int col=1; col<=columns; col++){
103 iterationsUnlimited++;
104
105 final byte rBase=ref[refOffset+col];
106
107 final boolean match=(qBase==rBase);//Note that qBase will never be 'N'
108
109 final int valueFromDiag=packed[row-1][col-1];
110 final int valueFromDel=packed[row][col-1];
111 final int valueFromIns=packed[row-1][col];
112 final int stateFromDiag=valueFromDiag&MODEMASK;
113 final int scoreFromDiag=valueFromDiag&HIGHMASK;
114 final int stateFromDel=valueFromDel&MODEMASK;
115 final int scoreFromDel=valueFromDel&HIGHMASK;
116 final int stateFromIns=valueFromIns&MODEMASK;
117 final int scoreFromIns=valueFromIns&HIGHMASK;
118
119 // final boolean prevMatch=stateFromDiag==MODE_MATCH;
120 // final boolean prevSub=stateFromDiag==MODE_SUB;
121 // final boolean prevDel=stateFromDel==MODE_DEL;
122 // final boolean prevIns=stateFromIns==MODE_INS;
123
124 //Old conditional code, replaced by faster arrays
125 // final int diagScoreM=MODE_MATCH|(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
126 // final int diagScoreS=MODE_SUB|(prevSub ? POINTSoff_SUB2 : POINTSoff_SUB);
127 // final int delScore=scoreFromDel+(prevDel ? POINTSoff_DEL2 : POINTSoff_DEL)|MODE_DEL;
128 // final int insScore=scoreFromIns+(prevIns ? POINTSoff_INS2 : POINTSoff_INS)|MODE_INS;
129
130 final int diagScoreM=matchScoreArray[stateFromDiag];
131 final int diagScoreS=subScoreArray[stateFromDiag];
132 final int delScore=scoreFromDel+delScoreArray[stateFromDel];
133 final int insScore=scoreFromIns+insScoreArray[stateFromIns];
134
135 int diagScore=(match ? diagScoreM : diagScoreS);
136 diagScore=scoreFromDiag+(rBase!='N' ? diagScore : POINTSoff_NOREF_MODE_SUB);
137
138 int score=diagScore>=delScore ? diagScore : delScore;
139 score=score>=insScore ? score : insScore;
140
141 packed[row][col]=score;
142 }
143 }
144
145
146 int maxCol=-1;
147 int maxState=-1;
148 int maxStart=-1;
149 int maxScore=Integer.MIN_VALUE;
150
151 for(int col=1; col<=columns; col++){
152 int x=packed[rows][col];
153 if((x&SCOREMASK)>maxScore){
154 maxScore=x;
155 maxCol=col;
156 maxState=x&MODEMASK;
157 maxStart=x&STARTMASK;
158 }
159 }
160 maxScore>>=SCOREOFFSET;
161
162 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore);
163 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart};
164 }
165
166
167
168 /** Generates the match string */
169 public final byte[] traceback(int refStartLoc, int refEndLoc, int row, int col, int state){
170 // assert(false);
171 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
172 assert(row==rows);
173
174 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
175 int outPos=0;
176
177 // assert(state==(packed[row][col]&MODEMASK));
178
179 while(row>0 && col>0){
180 state=(packed[row][col]&MODEMASK);
181 if(state==MODE_MATCH){
182 col--;
183 row--;
184 out[outPos]='m';
185 }else if(state==MODE_SUB){
186 col--;
187 row--;
188 out[outPos]='S';
189 }else if(state==MODE_DEL){
190 col--;
191 out[outPos]='D';
192 }else if(state==MODE_INS){
193 row--;
194 out[outPos]='I';
195 }else{
196 assert(false) : state;
197 }
198 outPos++;
199 }
200
201 assert(row==0 || col==0);
202 if(col!=row){
203 while(row>0){
204 out[outPos]='X';
205 outPos++;
206 row--;
207 col--;
208 }
209 if(col>0){
210 //do nothing
211 }
212 }
213
214 //Shrink and reverse the string
215 byte[] out2=new byte[outPos];
216 for(int i=0; i<outPos; i++){
217 out2[i]=out[outPos-i-1];
218 }
219 out=null;
220
221 return out2;
222 }
223
224 /** @return {score, bestRefStart, bestRefStop} */
225 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
226 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){
227
228 int row=maxRow;
229 int col=maxCol;
230 int state=maxState;
231
232 assert(maxState>=0 && maxState<packed.length) :
233 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
234 assert(maxRow>=0 && maxRow<packed.length) :
235 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
236 assert(maxCol>=0 && maxCol<packed[0].length) :
237 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
238
239 int score=packed[maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
240
241 if(row<rows){
242 int difR=rows-row;
243 int difC=columns-col;
244
245 while(difR>difC){
246 score+=POINTSoff_NOREF;
247 difR--;
248 }
249
250 row+=difR;
251 col+=difR;
252
253 }
254
255 assert(refStartLoc<=refEndLoc);
256 assert(row==rows);
257
258
259 final int bestRefStop=refStartLoc+col-1;
260
261 while(row>0 && col>0){
262 state=(packed[row][col]&MODEMASK);
263 if(state==MODE_MATCH){
264 col--;
265 row--;
266 }else if(state==MODE_SUB){
267 col--;
268 row--;
269 }else if(state==MODE_DEL){
270 col--;
271 }else if(state==MODE_INS){
272 row--;
273 }else{
274 assert(false) : state;
275 }
276 }
277 // assert(false) : row+", "+col;
278 if(row>col){
279 col-=row;
280 }
281
282 final int bestRefStart=refStartLoc+col;
283
284 score>>=SCOREOFFSET;
285 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart);
286 int[] rvec;
287 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
288 int padLeft=Tools.max(0, refStartLoc-bestRefStart);
289 int padRight=Tools.max(0, bestRefStop-refEndLoc);
290 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
291 }else{
292 rvec=new int[] {score, bestRefStart, bestRefStop};
293 }
294 return rvec;
295 }
296
297
298 /** Will not fill areas that cannot match minScore.
299 * @return {score, bestRefStart, bestRefStop} */
300 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
301 int a=Tools.max(0, refStartLoc);
302 int b=Tools.min(ref.length-1, refEndLoc);
303 assert(b>=a);
304
305 if(b-a>=maxColumns){
306 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
307 assert(false) : refStartLoc+", "+refEndLoc;
308 b=Tools.min(ref.length-1, a+maxColumns-1);
309 }
310 int[] max=fillLimited(read, ref, a, b, minScore);
311 // return max==null ? null : new int[] {max[3], 0, max[1]};
312
313 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/));
314
315 return score;
316 }
317
318 public static final String toString(byte[] ref, int startLoc, int stopLoc){
319 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
320 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
321 return sb.toString();
322 }
323
324 public static int calcDelScore(int len){
325 if(len<=0){return 0;}
326 int score=POINTS_DEL;
327 if(len>1){
328 score+=(len-1)*POINTS_DEL2;
329 }
330 return score;
331 }
332
333 private static int calcDelScoreOffset(int len){
334 if(len<=0){return 0;}
335 int score=POINTSoff_DEL;
336 if(len>1){
337 score+=(len-1)*POINTSoff_DEL2;
338 }
339 return score;
340 }
341
342 public static int calcInsScore(int len){
343 if(len<=0){return 0;}
344 int score=POINTS_INS;
345
346 if(len>1){
347 score+=(len-1)*POINTS_INS2;
348 }
349 return score;
350 }
351
352 private static int calcInsScoreOffset(int len){
353 if(len<=0){return 0;}
354 int score=POINTSoff_INS;
355
356 if(len>1){
357 score+=(len-1)*POINTSoff_INS2;
358 }
359 return score;
360 }
361
362
363 private final int maxRows;
364 public final int maxColumns;
365
366 private final int[][] packed;
367
368 private final int[] vertLimit;
369 private final int[] horizLimit;
370
371 private final int[] insScoreArray;
372 private final int[] delScoreArray;
373 private final int[] matchScoreArray;
374 private final int[] subScoreArray;
375
376 public static final int MODEBITS=3;
377 public static final int STARTBITS=9;
378 public static final int LOWBITS=MODEBITS+STARTBITS;
379 public static final int SCOREBITS=32-STARTBITS;
380 public static final int MAX_START=((1<<STARTBITS)-1);
381 public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000;
382 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
383
384 // public static final int MODEOFFSET=0; //Always zero.
385 public static final int STARTOFFSET=MODEBITS;
386 public static final int SCOREOFFSET=LOWBITS;
387
388 public static final int MODEMASK=~((-1)<<MODEBITS);
389 public static final int STARTMASK=(~((-1)<<STARTBITS))<<STARTOFFSET;
390 public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
391 public static final int HIGHMASK=SCOREMASK|STARTMASK;
392
393 //For some reason changing MODE_DEL from 1 to 0 breaks everything
394 private static final byte MODE_DEL=1;
395 private static final byte MODE_INS=2;
396 private static final byte MODE_SUB=3;
397 private static final byte MODE_MATCH=4;
398
399 public static final int POINTS_NOREF=-10;
400 public static final int POINTS_MATCH=90;
401 public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
402 public static final int POINTS_SUB=-143;
403 public static final int POINTS_SUB2=-54;
404 public static final int POINTS_INS=-207;
405 public static final int POINTS_INS2=-51;
406 public static final int POINTS_INS3=-37;
407 public static final int POINTS_DEL=-273;
408 public static final int POINTS_DEL2=-38;
409
410
411 // public static final int POINTS_NOREF=-10;
412 // public static final int POINTS_NOCALL=-10;
413 // public static final int POINTS_MATCH=90;
414 // public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
415 // public static final int POINTS_SUB=-143;
416 // public static final int POINTS_SUB2=-54;
417 // public static final int POINTS_INS=-207;
418 // public static final int POINTS_INS2=-51;
419 // public static final int POINTS_INS3=-37;
420 // public static final int POINTS_DEL=-273;
421 // public static final int POINTS_DEL2=-38;
422
423 public static final int BAD=MIN_SCORE-1;
424
425
426 public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
427 public static final int POINTSoff_NOREF_MODE_SUB=POINTSoff_NOREF|MODE_SUB;
428 public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
429 public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
430 public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
431 public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
432 public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
433 public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
434 public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
435 public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
436 public static final int BADoff=(BAD<<SCOREOFFSET);
437 public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
438 public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
439
440 private int rows;
441 private int columns;
442
443 public long iterationsLimited=0;
444 public long iterationsUnlimited=0;
445
446 public boolean verbose=false;
447 public boolean verbose2=false;
448
449 }