comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerFlat.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package aligner;
2
3 import dna.AminoAcid;
4 import shared.KillSwitch;
5 import shared.Tools;
6
7 /**
8 * Based on MSA9PBA, but reduced to a single matrix. */
9 public final class SingleStateAlignerFlat implements Aligner {
10
11
12 public SingleStateAlignerFlat(){}
13
14 private void prefillTopRow(){
15 final int[] header=packed[0];
16 final int qlen=rows;
17 for(int i=0; i<=columns; i++){
18 int x=columns-i+1;
19 int qbases=qlen-x;
20
21 //Minimal points to prefer a leftmost alignment
22 header[i]=qbases<=0 ? 0 : ((-qbases)<<STARTOFFSET);
23
24 //Forces consumption of query, but does not allow for insertions...
25 // header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases);
26 }
27 }
28
29 private void prefillLeftColumnStartingAt(int i){
30 packed[0][0]=MODE_MATCH;
31 i=Tools.max(1, i);
32 for(int score=MODE_INS+(POINTSoff_INS*i); i<=maxRows; i++){//Fill column 0 with insertions
33 score+=POINTSoff_INS;
34 packed[i][0]=score;
35 }
36 }
37
38 // private void clearInner(){//NotmaxRows needed
39 // for(int i=1; i<=maxRows; i++){
40 // for(int j=1; j<=maxColumns; j++){
41 // packed[i][j]=0;
42 // }
43 // }
44 // }
45 //
46 // private void clearAll(){//NotmaxRows needed
47 // for(int i=0; i<=maxRows; i++){
48 // for(int j=0; j<=maxColumns; j++){
49 // packed[i][j]=0;
50 // }
51 // }
52 // }
53
54 private void initialize(int rows_, int columns_){
55 rows=rows_;
56 columns=columns_;
57 if(rows<=maxRows && columns<=maxColumns){
58 prefillTopRow();
59 // prefillLeftColumn();
60 return;
61 }
62
63 final int maxRows0=maxRows;
64 final int maxColumns0=maxColumns;
65 final int[][] packed0=packed;
66
67 //Monotonic increase
68 maxRows=Tools.max(maxRows, rows+10);
69 maxColumns=Tools.max(maxColumns, columns+10);
70
71 if(packed==null || maxColumns>maxColumns0){//Make a new matrix
72 packed=KillSwitch.allocInt2D(maxRows+1, maxColumns+1);
73 prefillLeftColumnStartingAt(1);
74 }else{//Copy old rows
75 assert(maxRows0>0 && maxColumns0>0);
76 assert(maxRows>maxRows0 && maxColumns<=maxColumns0);
77 packed=KillSwitch.allocInt2D(maxRows+1);
78 for(int i=0; i<packed.length; i++){
79 if(i<packed0.length){
80 packed[i]=packed0[i];
81 }else{
82 packed[i]=KillSwitch.allocInt1D(maxColumns+1);
83 }
84 }
85 //Fill column 0 with insertions
86 prefillLeftColumnStartingAt(maxRows0);
87 }
88 prefillTopRow();
89 }
90
91 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
92 * Will not fill areas that cannot match minScore */
93 @Override
94 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
95 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore);
96 }
97
98 @Override
99 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
100 return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999);
101 }
102
103 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
104 * Min score is optional */
105 @Override
106 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
107 initialize(read.length, refEndLoc-refStartLoc+1);
108
109 //temporary, for finding a bug
110 if(rows>maxRows || columns>maxColumns){
111 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
112 }
113
114 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
115 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
116
117 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
118 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
119
120 final int refOffset=refStartLoc-1;
121 for(int row=1; row<=rows; row++){
122
123 final byte qBase=read[row-1];
124 for(int col=1; col<=columns; col++){
125
126 final byte rBase=ref[refOffset+col];
127
128 final boolean match=(qBase==rBase);
129 final boolean defined=(qBase!='N' && rBase!='N');
130
131 final int valueFromDiag=packed[row-1][col-1];
132 final int valueFromDel=packed[row][col-1];
133 final int valueFromIns=packed[row-1][col];
134 // final int stateFromDiag=valueFromDiag&MODEMASK;
135 final int scoreFromDiag=valueFromDiag&HIGHMASK;
136 // final int stateFromDel=valueFromDel&MODEMASK;
137 final int scoreFromDel=valueFromDel&HIGHMASK;
138 // final int stateFromIns=valueFromIns&MODEMASK;
139 final int scoreFromIns=valueFromIns&HIGHMASK;
140
141 // final boolean prevMatch=stateFromDiag==MODE_MATCH;
142 // final boolean prevSub=stateFromDiag==MODE_SUB;
143 // final boolean prevDel=stateFromDel==MODE_DEL;
144 // final boolean prevIns=stateFromIns==MODE_INS;
145
146 //Old conditional code, replaced by faster arrays
147 // final int diagScoreM=MODE_MATCH|(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
148 // final int diagScoreS=MODE_SUB|(prevSub ? POINTSoff_SUB2 : POINTSoff_SUB);
149 // final int delScore=scoreFromDel+(prevDel ? POINTSoff_DEL2 : POINTSoff_DEL)|MODE_DEL;
150 // final int insScore=scoreFromIns+(prevIns ? POINTSoff_INS2 : POINTSoff_INS)|MODE_INS;
151
152 final int diagScoreM=POINTSoff_MATCH|MODE_MATCH;
153 final int diagScoreS=POINTSoff_SUB|MODE_SUB;
154 final int delScore=scoreFromDel+POINTSoff_DEL|MODE_DEL;
155 final int insScore=scoreFromIns+POINTSoff_INS|MODE_INS;
156
157 int diagScore=(match ? diagScoreM : diagScoreS);
158 diagScore=scoreFromDiag+(defined ? diagScore : POINTSoff_NOREF_MODE_SUB);
159
160 int score=diagScore>=delScore ? diagScore : delScore;
161 score=score>=insScore ? score : insScore;
162
163 packed[row][col]=score;
164 }
165 // iterationsUnlimited+=columns;
166 }
167
168
169 int maxCol=-1;
170 int maxState=-1;
171 int maxStart=-1;
172 int maxScore=Integer.MIN_VALUE;
173
174 for(int col=1; col<=columns; col++){
175 int x=packed[rows][col];
176 if((x&SCOREMASK)>maxScore){
177 maxScore=x;
178 maxCol=col;
179 maxState=x&MODEMASK;
180 maxStart=x&STARTMASK;
181 }
182 }
183 maxScore>>=SCOREOFFSET;
184
185 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore);
186 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart};
187 }
188
189 /** Generates the match string */
190 @Override
191 public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
192 // assert(false);
193 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
194 assert(row==rows);
195
196 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
197 int outPos=0;
198
199 // assert(state==(packed[row][col]&MODEMASK));
200
201 while(row>0 && col>0){
202 byte q=query[row-1];
203 byte r=ref[refStartLoc+col-1];
204 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
205 state=(packed[row][col]&MODEMASK);
206 if(state==MODE_MATCH){
207 col--;
208 row--;
209 out[outPos]=defined ? (byte)'m' : (byte)'N';
210 }else if(state==MODE_SUB){
211 col--;
212 row--;
213 out[outPos]=defined ? (byte)'S' : (byte)'N';
214 }else if(state==MODE_N){
215 col--;
216 row--;
217 out[outPos]='N';
218 }else if(state==MODE_DEL){
219 col--;
220 out[outPos]='D';
221 }else if(state==MODE_INS){
222 row--;
223 // out[outPos]='I';
224 if(col>=0 && col<columns){
225 out[outPos]='I';
226 }else{
227 out[outPos]='C';
228 col--;
229 }
230 }else{
231 assert(false) : state;
232 }
233 outPos++;
234 }
235
236 assert(row==0 || col==0);
237 if(col!=row){//Not sure what this is doing
238 while(row>0){
239 out[outPos]='C';
240 outPos++;
241 row--;
242 col--;
243 }
244 if(col>0){
245 //do nothing
246 }
247 }
248
249 //Shrink and reverse the string
250 byte[] out2=new byte[outPos];
251 for(int i=0; i<outPos; i++){
252 out2[i]=out[outPos-i-1];
253 }
254 out=null;
255
256 return out2;
257 }
258
259 @Override
260 /** Generates identity;
261 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
262 public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
263
264 // assert(false);
265 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
266 assert(row==rows);
267
268 // assert(state==(packed[row][col]&MODEMASK));
269 int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
270
271 while(row>0 && col>0){
272 byte q=query[row-1];
273 byte r=ref[refStartLoc+col-1];
274 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
275 state=(packed[row][col]&MODEMASK);
276 if(state==MODE_MATCH){
277 col--;
278 row--;
279 match+=(defined ? 1 : 0);
280 noref+=(defined ? 0 : 1);
281 }else if(state==MODE_SUB){
282 col--;
283 row--;
284 sub+=(defined ? 1 : 0);
285 noref+=(defined ? 0 : 1);
286 }else if(state==MODE_N){//Can't currently happen
287 col--;
288 row--;
289 noref++;
290 }else if(state==MODE_DEL){
291 col--;
292 del++;
293 }else if(state==MODE_INS){
294 row--;
295 boolean edge=(col<=1 || col>=columns);
296 ins+=(edge ? 0 : 1);
297 clip+=(edge ? 1 : 0);
298 }else{
299 assert(false) : state;
300 }
301 }
302
303 assert(row==0 || col==0);
304 if(col!=row){//Not sure what this is doing
305 while(row>0){
306 clip++;
307 row--;
308 col--;
309 }
310 if(col>0){
311 //do nothing
312 }
313 }
314
315 if(extra!=null){
316 assert(extra.length==5);
317 extra[0]=match;
318 extra[1]=sub;
319 extra[2]=del;
320 extra[3]=ins;
321 extra[4]=noref;
322 extra[5]=clip;
323 }
324
325 float len=match+sub+ins+del+noref*0.1f;
326 float id=match/Tools.max(1.0f, len);
327 return id;
328 }
329
330 /** @return {score, bestRefStart, bestRefStop} */
331 @Override
332 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
333 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){
334
335 int row=maxRow;
336 int col=maxCol;
337 int state=maxState;
338
339 assert(maxState>=0 && maxState<packed.length) :
340 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
341 assert(maxRow>=0 && maxRow<packed.length) :
342 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
343 assert(maxCol>=0 && maxCol<packed[0].length) :
344 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
345
346 int score=packed[maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
347
348 if(row<rows){
349 int difR=rows-row;
350 int difC=columns-col;
351
352 while(difR>difC){
353 score+=POINTSoff_NOREF;
354 difR--;
355 }
356
357 row+=difR;
358 col+=difR;
359
360 }
361
362 assert(refStartLoc<=refEndLoc);
363 assert(row==rows);
364
365
366 final int bestRefStop=refStartLoc+col-1;
367
368 while(row>0 && col>0){
369 // final byte c=read[row-1];
370 // final byte r=ref[refStartLoc+col-1];
371 // final boolean defined=(AminoAcid.isFullyDefined(c) && AminoAcid.isFullyDefined(r));
372 state=(packed[row][col]&MODEMASK);
373 if(state==MODE_MATCH){
374 col--;
375 row--;
376 }else if(state==MODE_SUB){
377 col--;
378 row--;
379 }else if(state==MODE_N){
380 col--;
381 row--;
382 }else if(state==MODE_DEL){
383 col--;
384 }else if(state==MODE_INS){
385 row--;
386 }else{
387 assert(false) : state;
388 }
389 }
390 // assert(false) : row+", "+col;
391 if(row>col){
392 col-=row;
393 }
394
395 final int bestRefStart=refStartLoc+col;
396
397 score>>=SCOREOFFSET;
398 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart);
399 int[] rvec;
400 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
401 int padLeft=Tools.max(0, refStartLoc-bestRefStart);
402 int padRight=Tools.max(0, bestRefStop-refEndLoc);
403 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
404 }else{
405 rvec=new int[] {score, bestRefStart, bestRefStop};
406 }
407 return rvec;
408 }
409
410
411 /** Will not fill areas that cannot match minScore.
412 * @return {score, bestRefStart, bestRefStop} */
413 @Override
414 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
415 int a=Tools.max(0, refStartLoc);
416 int b=Tools.min(ref.length-1, refEndLoc);
417 assert(b>=a);
418
419 if(b-a>=maxColumns){
420 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
421 assert(false) : refStartLoc+", "+refEndLoc;
422 b=Tools.min(ref.length-1, a+maxColumns-1);
423 }
424 int[] max=fillLimited(read, ref, a, b, minScore);
425 // return max==null ? null : new int[] {max[3], 0, max[1]};
426
427 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/));
428
429 return score;
430 }
431
432 public static final String toString(byte[] ref, int startLoc, int stopLoc){
433 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
434 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
435 return sb.toString();
436 }
437
438 // public int maxScoreByIdentity(int len, float identity){
439 // assert(identity>=0 && identity<=1);
440 // return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
441 // }
442
443 @Override
444 public int minScoreByIdentity(int len, float identity){
445 assert(identity>=0 && identity<=1);
446
447 int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
448 int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS));
449 int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL));
450 return Tools.min(a, b, c);
451 }
452
453 private static int calcDelScoreOffset(int len){
454 if(len<=0){return 0;}
455 int score=POINTSoff_DEL*len;
456 return score;
457 }
458 //
459 // public static int calcInsScore(int len){
460 // if(len<=0){return 0;}
461 // int score=POINTS_INS;
462 //
463 // if(len>1){
464 // score+=(len-1)*POINTS_INS2;
465 // }
466 // return score;
467 // }
468 //
469 // private static int calcInsScoreOffset(int len){
470 // if(len<=0){return 0;}
471 // int score=POINTSoff_INS;
472 //
473 // if(len>1){
474 // score+=(len-1)*POINTSoff_INS2;
475 // }
476 // return score;
477 // }
478
479 @Override
480 public int rows(){return rows;}
481 @Override
482 public int columns(){return columns;}
483
484
485 private int maxRows;
486 private int maxColumns;
487
488 private int[][] packed;
489
490 public static final int MODEBITS=3;
491 public static final int STARTBITS=9;
492 public static final int LOWBITS=MODEBITS+STARTBITS;
493 public static final int SCOREBITS=32-STARTBITS;
494 public static final int MAX_START=((1<<STARTBITS)-1);
495 public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000;
496 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
497
498 public static final int STARTOFFSET=MODEBITS;
499 public static final int SCOREOFFSET=LOWBITS;
500
501 public static final int MODEMASK=~((-1)<<MODEBITS);
502 public static final int STARTMASK=(~((-1)<<STARTBITS))<<STARTOFFSET;
503 public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
504 public static final int HIGHMASK=SCOREMASK|STARTMASK;
505
506 //For some reason changing MODE_DEL from 1 to 0 breaks everything
507 private static final byte MODE_DEL=1;
508 private static final byte MODE_INS=2;
509 private static final byte MODE_SUB=3;
510 private static final byte MODE_MATCH=4;
511 private static final byte MODE_N=5;
512
513 public static final int POINTS_NOREF=-15;
514 public static final int POINTS_MATCH=100;
515 public static final int POINTS_SUB=-50;
516 public static final int POINTS_INS=-121;
517 public static final int POINTS_DEL=-111;
518
519 public static final int BAD=MIN_SCORE-1;
520
521
522 public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
523 public static final int POINTSoff_NOREF_MODE_SUB=POINTSoff_NOREF|MODE_SUB;
524 public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
525 public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
526 public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
527 public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
528 public static final int BADoff=(BAD<<SCOREOFFSET);
529
530 private int rows;
531 private int columns;
532
533 // public long iterationsLimited=0;
534 // public long iterationsUnlimited=0;
535
536 public boolean verbose=false;
537 public boolean verbose2=false;
538
539 }