comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerFlatFloat.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package aligner;
2
3 import dna.AminoAcid;
4 import shared.KillSwitch;
5 import shared.Tools;
6
7 /**
8 * Based on SSAFlat, but with previous state pointers removed. */
9 public final class SingleStateAlignerFlatFloat implements Aligner {
10
11
12 public SingleStateAlignerFlatFloat(){}
13
14 private void prefillTopRow(){
15 final float[] header=packed[0];
16 final int qlen=rows;
17 for(int i=0; i<=columns; i++){
18 int x=columns-i+1;
19 int qbases=qlen-x;
20
21 //Minimal points to prefer a leftmost alignment
22 header[i]=qbases<=0 ? 0 : -qbases;
23
24 //Forces consumption of query, but does not allow for insertions...
25 // header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases);
26 }
27 }
28
29 private void prefillLeftColumnStartingAt(int i){
30 packed[0][0]=MODE_MATCH;
31 i=Tools.max(1, i);
32 for(float score=MODE_INS+(POINTS_INS*i); i<=maxRows; i++){//Fill column 0 with insertions
33 score+=POINTS_INS;
34 packed[i][0]=score;
35 }
36 }
37
38 private void initialize(int rows_, int columns_){
39 rows=rows_;
40 columns=columns_;
41 if(rows<=maxRows && columns<=maxColumns){
42 prefillTopRow();
43 // prefillLeftColumn();
44 return;
45 }
46
47 final int maxRows0=maxRows;
48 final int maxColumns0=maxColumns;
49 final float[][] packed0=packed;
50
51 //Monotonic increase
52 maxRows=Tools.max(maxRows, rows+10);
53 maxColumns=Tools.max(maxColumns, columns+10);
54
55 if(packed==null || maxColumns>maxColumns0){//Make a new matrix
56 packed=KillSwitch.allocFloat2D(maxRows+1, maxColumns+1);
57 prefillLeftColumnStartingAt(1);
58 }else{//Copy old rows
59 assert(maxRows0>0 && maxColumns0>0);
60 assert(maxRows>maxRows0 && maxColumns<=maxColumns0) : "rows="+rows+",maxRows="+maxRows+
61 ",maxRows0="+maxRows0+",columns="+columns+",maxColumns="+maxColumns+",maxColumns0="+maxColumns0;
62 packed=KillSwitch.allocFloat2D(maxRows+1);
63 for(int i=0; i<packed.length; i++){
64 if(i<packed0.length){
65 packed[i]=packed0[i];
66 }else{
67 packed[i]=KillSwitch.allocFloat1D(maxColumns+1);
68 }
69 }
70 //Fill column 0 with insertions
71 prefillLeftColumnStartingAt(maxRows0);
72 }
73 prefillTopRow();
74 }
75
76 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
77 * Will not fill areas that cannot match minScore */
78 @Override
79 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
80 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore);
81 }
82
83 @Override
84 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
85 return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999);
86 }
87
88 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
89 * Min score is optional */
90 @Override
91 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
92 initialize(read.length, refEndLoc-refStartLoc+1);
93
94 assert(refWeights==null || refWeights.length==ref.length);
95
96 //temporary, for finding a bug
97 if(rows>maxRows || columns>maxColumns){
98 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
99 }
100
101 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
102 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
103
104 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
105 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
106
107 final int refOffset=refStartLoc-1;
108 for(int row=1; row<=rows; row++){
109
110 final byte qBase=read[row-1];
111 for(int col=1; col<=columns; col++){
112
113 final int rpos=refOffset+col;
114 final byte rBase=ref[rpos];
115
116 final boolean match=(qBase==rBase);
117 final boolean defined=(qBase!='N' && rBase!='N');
118
119 final float scoreFromDiag=packed[row-1][col-1];
120 final float scoreFromDel=packed[row][col-1];
121 final float scoreFromIns=packed[row-1][col];
122
123 final float diagScoreM=POINTS_MATCH;
124 final float diagScoreS=POINTS_SUB;
125 final float delScore=scoreFromDel+POINTS_DEL;//*delWeights[rpos];
126 final float insScore=scoreFromIns+POINTS_INS;//*insWeights[rpos];
127
128 // assert(delWeights[rpos]==1f) : Arrays.toString(delWeights);
129 // assert(insWeights[rpos]==1f);
130 // assert(refWeights[rpos]==1f);
131
132 // final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
133 float diagScore=(match ? diagScoreM : diagScoreS);
134 diagScore=scoreFromDiag+(defined ? diagScore : POINTS_NOREF)*refWeights[rpos];
135
136 float score=diagScore>=delScore ? diagScore : delScore;
137 score=score>=insScore ? score : insScore;
138
139 packed[row][col]=score;
140 }
141 //iterationsUnlimited+=columns;
142 }
143
144
145 int maxCol=-1;
146 int maxState=-1;
147 float maxStart=-1;
148 float maxScore=Integer.MIN_VALUE;
149
150 for(int col=1; col<=columns; col++){
151 float x=packed[rows][col];
152 if(x>maxScore){
153 maxScore=x;
154 maxCol=col;
155
156 // assert(rows-1<read.length) : (rows-1)+", "+read.length;
157 // assert(refOffset+col<ref.length) : refOffset+", "+col+", "+ref.length;
158 // maxState=getState(rows, col, read[rows-1], ref[refOffset+col], refWeights[refOffset+col], insWeights[refOffset+col], delWeights[refOffset+col]);
159 maxState=getState(rows, col, read[rows-1], ref[refOffset+col], refWeights[refOffset+col]);
160 maxStart=x;
161 }
162 }
163
164 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore);
165 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, (int)maxScore, (int)maxStart};
166 }
167
168 int getState(int row, int col, byte q, byte r, float refWeight, float insWeight, float delWeight){
169 final boolean match=(q==r);
170 final boolean defined=(q!='N' && r!='N');
171
172 final float scoreFromDiag=packed[row-1][col-1];
173 final float scoreFromDel=packed[row][col-1];
174 final float scoreFromIns=packed[row-1][col];
175
176 final float diagScoreM=POINTS_MATCH;
177 final float diagScoreS=POINTS_SUB;
178 final float delScore=scoreFromDel+POINTS_DEL*delWeight;
179 final float insScore=scoreFromIns+POINTS_INS*insWeight;
180
181 final float diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF)*refWeight;
182
183 // int score2=diagScore>=delScore ? diagScore : delScore;
184 // score2=score>=insScore ? score : insScore;
185
186 // assert(score==score2) : score+", "+score2;
187
188 if(diagScore>=delScore && diagScore>=insScore){
189 return defined ? match ? MODE_MATCH : MODE_SUB : MODE_N;
190 }else if(delScore>=insScore){
191 return MODE_DEL;
192 }
193 return MODE_INS;
194 }
195
196 int getState(int row, int col, byte q, byte r, float refWeight){
197 final boolean match=(q==r);
198 final boolean defined=(q!='N' && r!='N');
199
200 final float scoreFromDiag=packed[row-1][col-1];
201 final float scoreFromDel=packed[row][col-1];
202 final float scoreFromIns=packed[row-1][col];
203
204 final float diagScoreM=POINTS_MATCH;
205 final float diagScoreS=POINTS_SUB;
206 final float delScore=scoreFromDel+POINTS_DEL;
207 final float insScore=scoreFromIns+POINTS_INS;
208
209 final float diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF)*refWeight;
210
211 // int score2=diagScore>=delScore ? diagScore : delScore;
212 // score2=score>=insScore ? score : insScore;
213
214 // assert(score==score2) : score+", "+score2;
215
216 if(diagScore>=delScore && diagScore>=insScore){
217 return defined ? match ? MODE_MATCH : MODE_SUB : MODE_N;
218 }else if(delScore>=insScore){
219 return MODE_DEL;
220 }
221 return MODE_INS;
222 }
223
224
225 /** Generates the match string.
226 * State is NOT used. */
227 @Override
228 public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
229 // assert(false);
230 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
231 assert(row==rows);
232
233 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
234 int outPos=0;
235
236 // assert(state==(packed[row][col]&MODEMASK));
237
238 while(row>0 && col>0){
239 byte q=query[row-1];
240 int rpos=refStartLoc+col-1;
241 byte r=ref[rpos];
242 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
243 // state=getState(row, col, q, r, refWeights[rpos], insWeights[rpos], delWeights[rpos]);
244 state=getState(row, col, q, r, refWeights[rpos]);
245 if(state==MODE_MATCH){
246 col--;
247 row--;
248 out[outPos]=defined ? (byte)'m' : (byte)'N';
249 }else if(state==MODE_SUB){
250 col--;
251 row--;
252 out[outPos]=defined ? (byte)'S' : (byte)'N';
253 }else if(state==MODE_N){
254 col--;
255 row--;
256 out[outPos]='N';
257 }else if(state==MODE_DEL){
258 col--;
259 out[outPos]='D';
260 }else if(state==MODE_INS){
261 row--;
262 // out[outPos]='I';
263 // out[outPos]=(col<0 || col>=columns ? (byte)'C' : (byte)'I');
264 if(col>=0 && col<columns){
265 out[outPos]='I';
266 }else{
267 out[outPos]='C';
268 col--;
269 }
270 }else{
271 assert(false) : state;
272 }
273 outPos++;
274 }
275
276 assert(row==0 || col==0);
277 if(col!=row){
278 while(row>0){
279 out[outPos]='C';
280 outPos++;
281 row--;
282 col--;
283 }
284 if(col>0){
285 //do nothing
286 }
287 }
288
289 //Shrink and reverse the string
290 byte[] out2=new byte[outPos];
291 for(int i=0; i<outPos; i++){
292 out2[i]=out[outPos-i-1];
293 }
294 out=null;
295
296 return out2;
297 }
298
299 @Override
300 /** Generates identity;
301 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
302 public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
303
304 // assert(false);
305 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
306 assert(row==rows);
307
308 // assert(state==(packed[row][col]&MODEMASK));
309 int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
310
311 while(row>0 && col>0){
312 byte q=query[row-1];
313 int rpos=refStartLoc+col-1;
314 byte r=ref[rpos];
315 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
316 // state=getState(row, col, q, r, refWeights[rpos], insWeights[rpos], delWeights[rpos]);
317 state=getState(row, col, q, r, refWeights[rpos]);
318 if(state==MODE_MATCH){
319 col--;
320 row--;
321 match+=(defined ? 1 : 0);
322 noref+=(defined ? 0 : 1);
323 }else if(state==MODE_SUB){
324 col--;
325 row--;
326 sub+=(defined ? 1 : 0);
327 noref+=(defined ? 0 : 1);
328 }else if(state==MODE_N){
329 col--;
330 row--;
331 noref++;
332 }else if(state==MODE_DEL){
333 col--;
334 del++;
335 }else if(state==MODE_INS){
336 row--;
337 boolean edge=(col<=1 || col>=columns);
338 ins+=(edge ? 0 : 1);
339 clip+=(edge ? 1 : 0);
340 }else{
341 assert(false) : state;
342 }
343 }
344
345 assert(row==0 || col==0);
346 if(col!=row){//Not sure what this is doing
347 while(row>0){
348 clip++;
349 row--;
350 col--;
351 }
352 if(col>0){
353 //do nothing
354 }
355 }
356
357 if(extra!=null){
358 assert(extra.length==5);
359 extra[0]=match;
360 extra[1]=sub;
361 extra[2]=del;
362 extra[3]=ins;
363 extra[4]=noref;
364 extra[5]=clip;
365 }
366
367 float len=match+sub+ins+del+noref*0.1f;
368 float id=match/Tools.max(1.0f, len);
369 return id;
370 }
371
372 /** @return {score, bestRefStart, bestRefStop} */
373 @Override
374 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
375 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){
376
377 int row=maxRow;
378 int col=maxCol;
379 int state=maxState;
380
381 assert(maxState>=0 && maxState<packed.length) :
382 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
383 assert(maxRow>=0 && maxRow<packed.length) :
384 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
385 assert(maxCol>=0 && maxCol<packed[0].length) :
386 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
387
388 float score=packed[maxRow][maxCol]; //Or zero, if it is to be recalculated
389
390 if(row<rows){
391 int difR=rows-row;
392 int difC=columns-col;
393
394 while(difR>difC){
395 score+=POINTS_NOREF;
396 difR--;
397 }
398
399 row+=difR;
400 col+=difR;
401
402 }
403
404 assert(refStartLoc<=refEndLoc);
405 assert(row==rows);
406
407
408 final int bestRefStop=refStartLoc+col-1;
409
410 while(row>0 && col>0){
411 final byte q=read[row-1];
412 int rpos=refStartLoc+col-1;
413 final byte r=ref[rpos];
414 // final boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
415 // state=getState(row, col, q, r, refWeights[rpos], insWeights[rpos], delWeights[rpos]);
416 state=getState(row, col, q, r, refWeights[rpos]);
417 if(state==MODE_MATCH){
418 col--;
419 row--;
420 }else if(state==MODE_SUB){
421 col--;
422 row--;
423 }else if(state==MODE_N){
424 col--;
425 row--;
426 }else if(state==MODE_DEL){
427 col--;
428 }else if(state==MODE_INS){
429 row--;
430 }else{
431 assert(false) : state;
432 }
433 }
434 // assert(false) : row+", "+col;
435 if(row>col){
436 col-=row;
437 }
438
439 final int bestRefStart=refStartLoc+col;
440
441 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart);
442 int[] rvec;
443 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
444 int padLeft=Tools.max(0, refStartLoc-bestRefStart);
445 int padRight=Tools.max(0, bestRefStop-refEndLoc);
446 rvec=new int[] {(int)score, bestRefStart, bestRefStop, padLeft, padRight};
447 }else{
448 rvec=new int[] {(int)score, bestRefStart, bestRefStop};
449 }
450 return rvec;
451 }
452
453
454 /** Will not fill areas that cannot match minScore.
455 * @return {score, bestRefStart, bestRefStop} */
456 @Override
457 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
458 int a=Tools.max(0, refStartLoc);
459 int b=Tools.min(ref.length-1, refEndLoc);
460 assert(b>=a);
461
462 if(b-a>=maxColumns){
463 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
464 assert(false) : refStartLoc+", "+refEndLoc;
465 b=Tools.min(ref.length-1, a+maxColumns-1);
466 }
467 int[] max=fillLimited(read, ref, a, b, minScore);
468 // return max==null ? null : new int[] {max[3], 0, max[1]};
469
470 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/));
471
472 return score;
473 }
474
475 public static final String toString(byte[] ref, int startLoc, int stopLoc){
476 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
477 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
478 return sb.toString();
479 }
480
481 // public static int calcDelScore(int len){
482 // if(len<=0){return 0;}
483 // int score=POINTS_DEL;
484 // if(len>1){
485 // score+=(len-1)*POINTS_DEL2;
486 // }
487 // return score;
488 // }
489
490 // public int maxScoreByIdentity(int len, float identity){
491 // assert(identity>=0 && identity<=1);
492 // return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
493 // }
494
495 @Override
496 public int minScoreByIdentity(int len, float identity){
497 assert(identity>=0 && identity<=1);
498
499 int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
500 int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS));
501 int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL));
502 return Tools.min(a, b, c);
503 }
504
505 private static float calcDelScore(int len){
506 if(len<=0){return 0;}
507 float score=POINTS_DEL*len;
508 return score;
509 }
510 //
511 // public static int calcInsScore(int len){
512 // if(len<=0){return 0;}
513 // int score=POINTS_INS;
514 //
515 // if(len>1){
516 // score+=(len-1)*POINTS_INS2;
517 // }
518 // return score;
519 // }
520 //
521 // private static int calcInsScoreOffset(int len){
522 // if(len<=0){return 0;}
523 // int score=POINTS_INS;
524 //
525 // if(len>1){
526 // score+=(len-1)*POINTS_INS2;
527 // }
528 // return score;
529 // }
530
531 @Override
532 public int rows(){return rows;}
533 @Override
534 public int columns(){return columns;}
535
536 public void setWeights(float[] refWeights_, float[] insWeights_, float[] delWeights_){
537 refWeights=refWeights_;
538 // insWeights=insWeights_;
539 // delWeights=delWeights_;
540 }
541
542 public void setWeights(float[] refWeights_){
543 refWeights=refWeights_;
544 }
545
546 private int maxRows;
547 private int maxColumns;
548
549 private float[][] packed;
550
551 private float[] refWeights;
552 //These don't seem to help.
553 // private float[] insWeights;
554 // private float[] delWeights;
555
556 public static final int MAX_SCORE=Integer.MAX_VALUE-2000;
557 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
558
559 //For some reason changing MODE_DEL from 1 to 0 breaks everything
560 private static final byte MODE_DEL=1;
561 private static final byte MODE_INS=2;
562 private static final byte MODE_SUB=3;
563 private static final byte MODE_MATCH=4;
564 private static final byte MODE_N=5;
565
566 public static final float POINTS_NOREF=-20;
567 public static final float POINTS_MATCH=100;
568 public static final float POINTS_SUB=-50;
569 public static final float POINTS_INS=-121;
570 public static final float POINTS_DEL=-111;
571
572 public static final int BAD=MIN_SCORE-1;
573
574 private int rows;
575 private int columns;
576
577 // public long iterationsLimited=0;
578 // public long iterationsUnlimited=0;
579
580 public boolean verbose=false;
581 public boolean verbose2=false;
582
583 }