comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerFlat2Amino.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package aligner;
2
3 import dna.AminoAcid;
4 import shared.KillSwitch;
5 import shared.Tools;
6
7 /**
8 * Based on SSAFlat, but with previous state pointers removed. */
9 public final class SingleStateAlignerFlat2Amino implements Aligner {
10
11
12 public SingleStateAlignerFlat2Amino(){}
13
14 private void prefillTopRow(){
15 final int[] header=packed[0];
16 final int qlen=rows;
17 for(int i=0; i<=columns; i++){
18 int x=columns-i+1;
19 int qbases=qlen-x;
20
21 //Minimal points to prefer a leftmost alignment
22 header[i]=qbases<=0 ? 0 : -qbases;
23
24 //Forces consumption of query, but does not allow for insertions...
25 // header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases);
26 }
27 }
28
29 private void prefillLeftColumnStartingAt(int i){
30 packed[0][0]=MODE_MATCH;
31 i=Tools.max(1, i);
32 for(int score=MODE_INS+(POINTS_INS*i); i<=maxRows; i++){//Fill column 0 with insertions
33 score+=POINTS_INS;
34 packed[i][0]=score;
35 }
36 }
37
38 private void initialize(int rows_, int columns_){
39 rows=rows_;
40 columns=columns_;
41 if(rows<=maxRows && columns<=maxColumns){
42 prefillTopRow();
43 // prefillLeftColumn();
44 return;
45 }
46
47 final int maxRows0=maxRows;
48 final int maxColumns0=maxColumns;
49 final int[][] packed0=packed;
50
51 //Monotonic increase
52 maxRows=Tools.max(maxRows, rows+10);
53 maxColumns=Tools.max(maxColumns, columns+10);
54
55 if(packed==null || maxColumns>maxColumns0){//Make a new matrix
56 packed=KillSwitch.allocInt2D(maxRows+1, maxColumns+1);
57 prefillLeftColumnStartingAt(1);
58 }else{//Copy old rows
59 assert(maxRows0>0 && maxColumns0>0);
60 assert(maxRows>maxRows0 && maxColumns<=maxColumns0);
61 packed=KillSwitch.allocInt2D(maxRows+1);
62 for(int i=0; i<packed.length; i++){
63 if(i<packed0.length){
64 packed[i]=packed0[i];
65 }else{
66 packed[i]=KillSwitch.allocInt1D(maxColumns+1);
67 }
68 }
69 //Fill column 0 with insertions
70 prefillLeftColumnStartingAt(maxRows0);
71 }
72 prefillTopRow();
73 }
74
75 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
76 * Will not fill areas that cannot match minScore */
77 @Override
78 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
79 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore);
80 }
81
82 @Override
83 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
84 return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999);
85 }
86
87 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
88 * Min score is optional */
89 @Override
90 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
91 initialize(read.length, refEndLoc-refStartLoc+1);
92
93 //temporary, for finding a bug
94 if(rows>maxRows || columns>maxColumns){
95 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
96 }
97
98 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
99 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
100
101 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
102 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
103
104 final int refOffset=refStartLoc-1;
105 for(int row=1; row<=rows; row++){
106
107 final byte qBase=read[row-1];
108 for(int col=1; col<=columns; col++){
109
110 final byte rBase=ref[refOffset+col];
111
112 final boolean match=(qBase==rBase);
113 final boolean defined=(AminoAcid.isFullyDefinedAA(qBase) && AminoAcid.isFullyDefinedAA(rBase));
114
115 final int scoreFromDiag=packed[row-1][col-1];
116 final int scoreFromDel=packed[row][col-1];
117 final int scoreFromIns=packed[row-1][col];
118
119 final int diagScoreM=POINTS_MATCH;
120 final int diagScoreS=POINTS_SUB;
121 final int delScore=scoreFromDel+POINTS_DEL;
122 final int insScore=scoreFromIns+POINTS_INS;
123
124 // final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
125 int diagScore=(match ? diagScoreM : diagScoreS);
126 diagScore=scoreFromDiag+(defined ? diagScore : POINTS_NOREF);
127
128 int score=diagScore>=delScore ? diagScore : delScore;
129 score=score>=insScore ? score : insScore;
130
131 packed[row][col]=score;
132 }
133 //iterationsUnlimited+=columns;
134 }
135
136
137 int maxCol=-1;
138 int maxState=-1;
139 int maxStart=-1;
140 int maxScore=Integer.MIN_VALUE;
141
142 for(int col=1; col<=columns; col++){
143 int x=packed[rows][col];
144 if(x>maxScore){
145 maxScore=x;
146 maxCol=col;
147
148 // assert(rows-1<read.length) : (rows-1)+", "+read.length;
149 // assert(refOffset+col<ref.length) : refOffset+", "+col+", "+ref.length;
150 maxState=getState(rows, col, read[rows-1], ref[refOffset+col]);
151 maxStart=x;
152 }
153 }
154
155 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore);
156 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart};
157 }
158
159 int getState(int row, int col, byte q, byte r){//zxvzxcv TODO: Fix - needs to find max
160 final boolean match=(q==r);
161 final boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
162
163 final int scoreFromDiag=packed[row-1][col-1];
164 final int scoreFromDel=packed[row][col-1];
165 final int scoreFromIns=packed[row-1][col];
166 // final int score=packed[row][col];
167
168 final int diagScoreM=POINTS_MATCH;
169 final int diagScoreS=POINTS_SUB;
170 final int delScore=scoreFromDel+POINTS_DEL;
171 final int insScore=scoreFromIns+POINTS_INS;
172
173 final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
174
175 // int score2=diagScore>=delScore ? diagScore : delScore;
176 // score2=score>=insScore ? score : insScore;
177
178 // assert(score==score2) : score+", "+score2;
179
180 if(diagScore>=delScore && diagScore>=insScore){
181 return defined ? match ? MODE_MATCH : MODE_SUB : MODE_N;
182 }else if(delScore>=insScore){
183 return MODE_DEL;
184 }
185 return MODE_INS;
186 }
187
188 /** Generates the match string */
189 @Override
190 public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
191 // assert(false);
192 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
193 assert(row==rows);
194
195 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
196 int outPos=0;
197
198 // assert(state==(packed[row][col]&MODEMASK));
199
200 while(row>0 && col>0){
201 byte q=query[row-1];
202 byte r=ref[refStartLoc+col-1];
203 boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
204 state=getState(row, col, q, r);
205 // assert(defined) : state+", "+(int)q+", "+(int)r+", "+new String(query);
206 // assert(state!=MODE_N) : state+", "+Character.toString(q)+", "+Character.toString(r)+", "+new String(query);
207 if(state==MODE_MATCH){
208 col--;
209 row--;
210 out[outPos]=defined ? (byte)'m' : (byte)'N';
211 }else if(state==MODE_SUB){
212 col--;
213 row--;
214 out[outPos]=defined ? (byte)'S' : (byte)'N';
215 }else if(state==MODE_N){
216 col--;
217 row--;
218 out[outPos]='N';
219 }else if(state==MODE_DEL){
220 col--;
221 out[outPos]='D';
222 }else if(state==MODE_INS){
223 row--;
224 // out[outPos]='I';
225 if(col>=0 && col<columns){
226 out[outPos]='I';
227 }else{
228 out[outPos]='C';
229 col--;
230 }
231 }else{
232 assert(false) : state;
233 }
234 outPos++;
235 }
236
237 assert(row==0 || col==0);
238 if(col!=row){//Not sure what this is doing
239 while(row>0){
240 out[outPos]='C';
241 outPos++;
242 row--;
243 col--;
244 }
245 if(col>0){
246 //do nothing
247 }
248 }
249
250 //Shrink and reverse the string
251 byte[] out2=new byte[outPos];
252 for(int i=0; i<outPos; i++){
253 out2[i]=out[outPos-i-1];
254 }
255 out=null;
256
257 return out2;
258 }
259
260 @Override
261 /** Generates identity;
262 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
263 public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
264
265 // assert(false);
266 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
267 assert(row==rows);
268
269 // assert(state==(packed[row][col]&MODEMASK));
270 int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
271
272 while(row>0 && col>0){
273 byte q=query[row-1];
274 byte r=ref[refStartLoc+col-1];
275 boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
276 state=getState(row, col, q, r);
277 if(state==MODE_MATCH){
278 col--;
279 row--;
280 match+=(defined ? 1 : 0);
281 noref+=(defined ? 0 : 1);
282 }else if(state==MODE_SUB){
283 col--;
284 row--;
285 sub+=(defined ? 1 : 0);
286 noref+=(defined ? 0 : 1);
287 }else if(state==MODE_N){
288 col--;
289 row--;
290 noref++;
291 }else if(state==MODE_DEL){
292 col--;
293 del++;
294 }else if(state==MODE_INS){
295 row--;
296 boolean edge=(col<=1 || col>=columns);
297 ins+=(edge ? 0 : 1);
298 clip+=(edge ? 1 : 0);
299 }else{
300 assert(false) : state;
301 }
302 }
303
304 assert(row==0 || col==0);
305 if(col!=row){//Not sure what this is doing
306 while(row>0){
307 clip++;
308 row--;
309 col--;
310 }
311 if(col>0){
312 //do nothing
313 }
314 }
315
316 if(extra!=null){
317 assert(extra.length==5);
318 extra[0]=match;
319 extra[1]=sub;
320 extra[2]=del;
321 extra[3]=ins;
322 extra[4]=noref;
323 extra[5]=clip;
324 }
325
326 float len=match+sub+ins+del+noref*0.1f;
327 float id=match/Tools.max(1.0f, len);
328 return id;
329 }
330
331 /** Generates identity;
332 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
333 public float tracebackIdentityAmino(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
334
335 // assert(false);
336 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
337 assert(row==rows);
338
339 // assert(state==(packed[row][col]&MODEMASK));
340 int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
341
342 while(row>0 && col>0){
343 byte q=query[row-1];
344 byte r=ref[refStartLoc+col-1];
345 boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
346 state=getState(row, col, q, r);
347 if(state==MODE_MATCH){
348 col--;
349 row--;
350 match+=(defined ? 1 : 0);
351 noref+=(defined ? 0 : 1);
352 }else if(state==MODE_SUB){
353 col--;
354 row--;
355 sub+=(defined ? 1 : 0);
356 noref+=(defined ? 0 : 1);
357 }else if(state==MODE_N){
358 col--;
359 row--;
360 noref++;
361 }else if(state==MODE_DEL){
362 col--;
363 del++;
364 }else if(state==MODE_INS){
365 row--;
366 boolean edge=(col<=1 || col>=columns);
367 ins+=(edge ? 0 : 1);
368 clip+=(edge ? 1 : 0);
369 }else{
370 assert(false) : state;
371 }
372 }
373
374 assert(row==0 || col==0);
375 if(col!=row){//Not sure what this is doing
376 while(row>0){
377 clip++;
378 row--;
379 col--;
380 }
381 if(col>0){
382 //do nothing
383 }
384 }
385
386 if(extra!=null){
387 assert(extra.length==5);
388 extra[0]=match;
389 extra[1]=sub;
390 extra[2]=del;
391 extra[3]=ins;
392 extra[4]=noref;
393 extra[5]=clip;
394 }
395
396 float len=match+sub+ins+del+noref*0.1f;
397 float id=match/Tools.max(1.0f, len);
398 return id;
399 }
400
401 /** @return {score, bestRefStart, bestRefStop} */
402 @Override
403 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
404 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){
405
406 int row=maxRow;
407 int col=maxCol;
408 int state=maxState;
409
410 assert(maxState>=0 && maxState<packed.length) :
411 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
412 assert(maxRow>=0 && maxRow<packed.length) :
413 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
414 assert(maxCol>=0 && maxCol<packed[0].length) :
415 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
416
417 int score=packed[maxRow][maxCol]; //Or zero, if it is to be recalculated
418
419 if(row<rows){
420 int difR=rows-row;
421 int difC=columns-col;
422
423 while(difR>difC){
424 score+=POINTS_NOREF;
425 difR--;
426 }
427
428 row+=difR;
429 col+=difR;
430
431 }
432
433 assert(refStartLoc<=refEndLoc);
434 assert(row==rows);
435
436
437 final int bestRefStop=refStartLoc+col-1;
438
439 while(row>0 && col>0){
440 final byte q=read[row-1];
441 final byte r=ref[refStartLoc+col-1];
442 // final boolean defined=(AminoAcid.isFullyDefinedAA(q) && AminoAcid.isFullyDefinedAA(r));
443 state=getState(row, col, q, r);
444 if(state==MODE_MATCH){
445 col--;
446 row--;
447 }else if(state==MODE_SUB){
448 col--;
449 row--;
450 }else if(state==MODE_N){
451 col--;
452 row--;
453 }else if(state==MODE_DEL){
454 col--;
455 }else if(state==MODE_INS){
456 row--;
457 }else{
458 assert(false) : state;
459 }
460 }
461 // assert(false) : row+", "+col;
462 if(row>col){
463 col-=row;
464 }
465
466 final int bestRefStart=refStartLoc+col;
467
468 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart);
469 int[] rvec;
470 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
471 int padLeft=Tools.max(0, refStartLoc-bestRefStart);
472 int padRight=Tools.max(0, bestRefStop-refEndLoc);
473 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
474 }else{
475 rvec=new int[] {score, bestRefStart, bestRefStop};
476 }
477 return rvec;
478 }
479
480
481 /** Will not fill areas that cannot match minScore.
482 * @return {score, bestRefStart, bestRefStop} */
483 @Override
484 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
485 int a=Tools.max(0, refStartLoc);
486 int b=Tools.min(ref.length-1, refEndLoc);
487 assert(b>=a);
488
489 if(b-a>=maxColumns){
490 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
491 assert(false) : refStartLoc+", "+refEndLoc;
492 b=Tools.min(ref.length-1, a+maxColumns-1);
493 }
494 int[] max=fillLimited(read, ref, a, b, minScore);
495 // return max==null ? null : new int[] {max[3], 0, max[1]};
496
497 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/));
498
499 return score;
500 }
501
502 public static final String toString(byte[] ref, int startLoc, int stopLoc){
503 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
504 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
505 return sb.toString();
506 }
507
508 // public static int calcDelScore(int len){
509 // if(len<=0){return 0;}
510 // int score=POINTS_DEL;
511 // if(len>1){
512 // score+=(len-1)*POINTS_DEL2;
513 // }
514 // return score;
515 // }
516
517 // public int maxScoreByIdentity(int len, float identity){
518 // assert(identity>=0 && identity<=1);
519 // return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
520 // }
521
522 @Override
523 public int minScoreByIdentity(int len, float identity){
524 assert(identity>=0 && identity<=1);
525
526 int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
527 int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS));
528 int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL));
529 return Tools.min(a, b, c);
530 }
531
532 private static int calcDelScore(int len){
533 if(len<=0){return 0;}
534 int score=POINTS_DEL*len;
535 return score;
536 }
537
538 @Override
539 public int rows(){return rows;}
540 @Override
541 public int columns(){return columns;}
542
543
544 private int maxRows;
545 private int maxColumns;
546
547 private int[][] packed;
548
549 public static final int MAX_SCORE=Integer.MAX_VALUE-2000;
550 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
551
552 //For some reason changing MODE_DEL from 1 to 0 breaks everything
553 private static final byte MODE_DEL=1;
554 private static final byte MODE_INS=2;
555 private static final byte MODE_SUB=3;
556 private static final byte MODE_MATCH=4;
557 private static final byte MODE_N=5;
558
559 public static final int POINTS_NOREF=-15;
560 public static final int POINTS_MATCH=100;
561 public static final int POINTS_SUB=-50;
562 public static final int POINTS_INS=-121;
563 public static final int POINTS_DEL=-111;
564
565 // public static final int POINTS_NOREF=-100000;
566 // public static final int POINTS_MATCH=100;
567 // public static final int POINTS_SUB=-100;
568 // public static final int POINTS_INS=-100;
569 // public static final int POINTS_DEL=-100;
570
571 public static final int BAD=MIN_SCORE-1;
572
573 private int rows;
574 private int columns;
575
576 // public long iterationsLimited=0;
577 // public long iterationsUnlimited=0;
578
579 public boolean verbose=false;
580 public boolean verbose2=false;
581
582 }