Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerPacBioAdapter.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package aligner; | |
2 | |
3 import java.util.Arrays; | |
4 | |
5 import shared.KillSwitch; | |
6 import shared.Tools; | |
7 | |
8 /** | |
9 * Based on MSA9PBA, but reduced to a single matrix. */ | |
10 public final class SingleStateAlignerPacBioAdapter { | |
11 | |
12 | |
13 public SingleStateAlignerPacBioAdapter(int maxRows_, int maxColumns_, int qlen){ | |
14 // assert(maxColumns_>=200); | |
15 // assert(maxRows_>=200); | |
16 maxRows=maxRows_; | |
17 maxColumns=maxColumns_; | |
18 packed=new int[maxRows+1][maxColumns+1]; | |
19 | |
20 insScoreArray=KillSwitch.allocInt1D(5); | |
21 delScoreArray=KillSwitch.allocInt1D(5); | |
22 matchScoreArray=KillSwitch.allocInt1D(5); | |
23 subScoreArray=KillSwitch.allocInt1D(5); | |
24 | |
25 Arrays.fill(insScoreArray, POINTSoff_INS|MODE_INS); | |
26 Arrays.fill(delScoreArray, POINTSoff_DEL|MODE_DEL); | |
27 Arrays.fill(matchScoreArray, POINTSoff_MATCH|MODE_MATCH); | |
28 Arrays.fill(subScoreArray, POINTSoff_SUB|MODE_SUB); | |
29 | |
30 insScoreArray[MODE_INS]=POINTSoff_INS2|MODE_INS; | |
31 delScoreArray[MODE_DEL]=POINTSoff_DEL2|MODE_DEL; | |
32 matchScoreArray[MODE_MATCH]=POINTSoff_MATCH2|MODE_MATCH; | |
33 subScoreArray[MODE_SUB]=POINTSoff_SUB2|MODE_SUB; | |
34 | |
35 vertLimit=new int[maxRows+1]; | |
36 horizLimit=new int[maxColumns+1]; | |
37 Arrays.fill(vertLimit, BADoff); | |
38 Arrays.fill(horizLimit, BADoff); | |
39 | |
40 // for(int i=0; i<maxColumns+1; i++){ | |
41 // scores[0][i]=0-i; | |
42 // } | |
43 | |
44 for(int i=1; i<=maxRows; i++){ | |
45 for(int j=1; j<packed[i].length; j++){ | |
46 packed[i][j]|=BADoff; | |
47 } | |
48 } | |
49 for(int i=0; i<=maxRows; i++){ | |
50 | |
51 int prevScore=(i<2 ? 0 : packed[i-1][0]); | |
52 int score=(i<2 ? (i*POINTSoff_INS) : prevScore+POINTSoff_INS2); | |
53 | |
54 packed[i][0]=score; | |
55 } | |
56 for(int i=0; i<packed[0].length; i++){ | |
57 packed[0][i]=packed[0][i]|(i<<STARTOFFSET); | |
58 int x=packed[0].length-i; | |
59 int qbases=qlen-x; | |
60 if(qbases>0){ | |
61 packed[0][i]|=calcDelScoreOffset(qbases); //Forces consumption of query | |
62 } | |
63 } | |
64 } | |
65 | |
66 | |
67 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
68 * Will not fill areas that cannot match minScore */ | |
69 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ | |
70 return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore); | |
71 } | |
72 | |
73 | |
74 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
75 * Will not fill areas that cannot match minScore */ | |
76 private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ | |
77 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore); | |
78 } | |
79 | |
80 | |
81 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
82 * Does not require a min score (ie, same as old method) */ | |
83 private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, final int minScore){ | |
84 rows=read.length; | |
85 columns=refEndLoc-refStartLoc+1; | |
86 | |
87 //temporary, for finding a bug | |
88 if(rows>maxRows || columns>maxColumns){ | |
89 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); | |
90 } | |
91 | |
92 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; | |
93 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; | |
94 | |
95 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; | |
96 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length; | |
97 | |
98 final int refOffset=refStartLoc-1; | |
99 for(int row=1; row<=rows; row++){ | |
100 | |
101 final byte qBase=read[row-1]; | |
102 for(int col=1; col<=columns; col++){ | |
103 iterationsUnlimited++; | |
104 | |
105 final byte rBase=ref[refOffset+col]; | |
106 | |
107 final boolean match=(qBase==rBase);//Note that qBase will never be 'N' | |
108 | |
109 final int valueFromDiag=packed[row-1][col-1]; | |
110 final int valueFromDel=packed[row][col-1]; | |
111 final int valueFromIns=packed[row-1][col]; | |
112 final int stateFromDiag=valueFromDiag&MODEMASK; | |
113 final int scoreFromDiag=valueFromDiag&HIGHMASK; | |
114 final int stateFromDel=valueFromDel&MODEMASK; | |
115 final int scoreFromDel=valueFromDel&HIGHMASK; | |
116 final int stateFromIns=valueFromIns&MODEMASK; | |
117 final int scoreFromIns=valueFromIns&HIGHMASK; | |
118 | |
119 // final boolean prevMatch=stateFromDiag==MODE_MATCH; | |
120 // final boolean prevSub=stateFromDiag==MODE_SUB; | |
121 // final boolean prevDel=stateFromDel==MODE_DEL; | |
122 // final boolean prevIns=stateFromIns==MODE_INS; | |
123 | |
124 //Old conditional code, replaced by faster arrays | |
125 // final int diagScoreM=MODE_MATCH|(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); | |
126 // final int diagScoreS=MODE_SUB|(prevSub ? POINTSoff_SUB2 : POINTSoff_SUB); | |
127 // final int delScore=scoreFromDel+(prevDel ? POINTSoff_DEL2 : POINTSoff_DEL)|MODE_DEL; | |
128 // final int insScore=scoreFromIns+(prevIns ? POINTSoff_INS2 : POINTSoff_INS)|MODE_INS; | |
129 | |
130 final int diagScoreM=matchScoreArray[stateFromDiag]; | |
131 final int diagScoreS=subScoreArray[stateFromDiag]; | |
132 final int delScore=scoreFromDel+delScoreArray[stateFromDel]; | |
133 final int insScore=scoreFromIns+insScoreArray[stateFromIns]; | |
134 | |
135 int diagScore=(match ? diagScoreM : diagScoreS); | |
136 diagScore=scoreFromDiag+(rBase!='N' ? diagScore : POINTSoff_NOREF_MODE_SUB); | |
137 | |
138 int score=diagScore>=delScore ? diagScore : delScore; | |
139 score=score>=insScore ? score : insScore; | |
140 | |
141 packed[row][col]=score; | |
142 } | |
143 } | |
144 | |
145 | |
146 int maxCol=-1; | |
147 int maxState=-1; | |
148 int maxStart=-1; | |
149 int maxScore=Integer.MIN_VALUE; | |
150 | |
151 for(int col=1; col<=columns; col++){ | |
152 int x=packed[rows][col]; | |
153 if((x&SCOREMASK)>maxScore){ | |
154 maxScore=x; | |
155 maxCol=col; | |
156 maxState=x&MODEMASK; | |
157 maxStart=x&STARTMASK; | |
158 } | |
159 } | |
160 maxScore>>=SCOREOFFSET; | |
161 | |
162 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore); | |
163 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
164 } | |
165 | |
166 | |
167 | |
168 /** Generates the match string */ | |
169 public final byte[] traceback(int refStartLoc, int refEndLoc, int row, int col, int state){ | |
170 // assert(false); | |
171 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; | |
172 assert(row==rows); | |
173 | |
174 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". | |
175 int outPos=0; | |
176 | |
177 // assert(state==(packed[row][col]&MODEMASK)); | |
178 | |
179 while(row>0 && col>0){ | |
180 state=(packed[row][col]&MODEMASK); | |
181 if(state==MODE_MATCH){ | |
182 col--; | |
183 row--; | |
184 out[outPos]='m'; | |
185 }else if(state==MODE_SUB){ | |
186 col--; | |
187 row--; | |
188 out[outPos]='S'; | |
189 }else if(state==MODE_DEL){ | |
190 col--; | |
191 out[outPos]='D'; | |
192 }else if(state==MODE_INS){ | |
193 row--; | |
194 out[outPos]='I'; | |
195 }else{ | |
196 assert(false) : state; | |
197 } | |
198 outPos++; | |
199 } | |
200 | |
201 assert(row==0 || col==0); | |
202 if(col!=row){ | |
203 while(row>0){ | |
204 out[outPos]='X'; | |
205 outPos++; | |
206 row--; | |
207 col--; | |
208 } | |
209 if(col>0){ | |
210 //do nothing | |
211 } | |
212 } | |
213 | |
214 //Shrink and reverse the string | |
215 byte[] out2=new byte[outPos]; | |
216 for(int i=0; i<outPos; i++){ | |
217 out2[i]=out[outPos-i-1]; | |
218 } | |
219 out=null; | |
220 | |
221 return out2; | |
222 } | |
223 | |
224 /** @return {score, bestRefStart, bestRefStop} */ | |
225 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, | |
226 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){ | |
227 | |
228 int row=maxRow; | |
229 int col=maxCol; | |
230 int state=maxState; | |
231 | |
232 assert(maxState>=0 && maxState<packed.length) : | |
233 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); | |
234 assert(maxRow>=0 && maxRow<packed.length) : | |
235 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); | |
236 assert(maxCol>=0 && maxCol<packed[0].length) : | |
237 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); | |
238 | |
239 int score=packed[maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated | |
240 | |
241 if(row<rows){ | |
242 int difR=rows-row; | |
243 int difC=columns-col; | |
244 | |
245 while(difR>difC){ | |
246 score+=POINTSoff_NOREF; | |
247 difR--; | |
248 } | |
249 | |
250 row+=difR; | |
251 col+=difR; | |
252 | |
253 } | |
254 | |
255 assert(refStartLoc<=refEndLoc); | |
256 assert(row==rows); | |
257 | |
258 | |
259 final int bestRefStop=refStartLoc+col-1; | |
260 | |
261 while(row>0 && col>0){ | |
262 state=(packed[row][col]&MODEMASK); | |
263 if(state==MODE_MATCH){ | |
264 col--; | |
265 row--; | |
266 }else if(state==MODE_SUB){ | |
267 col--; | |
268 row--; | |
269 }else if(state==MODE_DEL){ | |
270 col--; | |
271 }else if(state==MODE_INS){ | |
272 row--; | |
273 }else{ | |
274 assert(false) : state; | |
275 } | |
276 } | |
277 // assert(false) : row+", "+col; | |
278 if(row>col){ | |
279 col-=row; | |
280 } | |
281 | |
282 final int bestRefStart=refStartLoc+col; | |
283 | |
284 score>>=SCOREOFFSET; | |
285 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart); | |
286 int[] rvec; | |
287 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow | |
288 int padLeft=Tools.max(0, refStartLoc-bestRefStart); | |
289 int padRight=Tools.max(0, bestRefStop-refEndLoc); | |
290 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; | |
291 }else{ | |
292 rvec=new int[] {score, bestRefStart, bestRefStop}; | |
293 } | |
294 return rvec; | |
295 } | |
296 | |
297 | |
298 /** Will not fill areas that cannot match minScore. | |
299 * @return {score, bestRefStart, bestRefStop} */ | |
300 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ | |
301 int a=Tools.max(0, refStartLoc); | |
302 int b=Tools.min(ref.length-1, refEndLoc); | |
303 assert(b>=a); | |
304 | |
305 if(b-a>=maxColumns){ | |
306 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); | |
307 assert(false) : refStartLoc+", "+refEndLoc; | |
308 b=Tools.min(ref.length-1, a+maxColumns-1); | |
309 } | |
310 int[] max=fillLimited(read, ref, a, b, minScore); | |
311 // return max==null ? null : new int[] {max[3], 0, max[1]}; | |
312 | |
313 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/)); | |
314 | |
315 return score; | |
316 } | |
317 | |
318 public static final String toString(byte[] ref, int startLoc, int stopLoc){ | |
319 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1); | |
320 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);} | |
321 return sb.toString(); | |
322 } | |
323 | |
324 public static int calcDelScore(int len){ | |
325 if(len<=0){return 0;} | |
326 int score=POINTS_DEL; | |
327 if(len>1){ | |
328 score+=(len-1)*POINTS_DEL2; | |
329 } | |
330 return score; | |
331 } | |
332 | |
333 private static int calcDelScoreOffset(int len){ | |
334 if(len<=0){return 0;} | |
335 int score=POINTSoff_DEL; | |
336 if(len>1){ | |
337 score+=(len-1)*POINTSoff_DEL2; | |
338 } | |
339 return score; | |
340 } | |
341 | |
342 public static int calcInsScore(int len){ | |
343 if(len<=0){return 0;} | |
344 int score=POINTS_INS; | |
345 | |
346 if(len>1){ | |
347 score+=(len-1)*POINTS_INS2; | |
348 } | |
349 return score; | |
350 } | |
351 | |
352 private static int calcInsScoreOffset(int len){ | |
353 if(len<=0){return 0;} | |
354 int score=POINTSoff_INS; | |
355 | |
356 if(len>1){ | |
357 score+=(len-1)*POINTSoff_INS2; | |
358 } | |
359 return score; | |
360 } | |
361 | |
362 | |
363 private final int maxRows; | |
364 public final int maxColumns; | |
365 | |
366 private final int[][] packed; | |
367 | |
368 private final int[] vertLimit; | |
369 private final int[] horizLimit; | |
370 | |
371 private final int[] insScoreArray; | |
372 private final int[] delScoreArray; | |
373 private final int[] matchScoreArray; | |
374 private final int[] subScoreArray; | |
375 | |
376 public static final int MODEBITS=3; | |
377 public static final int STARTBITS=9; | |
378 public static final int LOWBITS=MODEBITS+STARTBITS; | |
379 public static final int SCOREBITS=32-STARTBITS; | |
380 public static final int MAX_START=((1<<STARTBITS)-1); | |
381 public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; | |
382 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD". | |
383 | |
384 // public static final int MODEOFFSET=0; //Always zero. | |
385 public static final int STARTOFFSET=MODEBITS; | |
386 public static final int SCOREOFFSET=LOWBITS; | |
387 | |
388 public static final int MODEMASK=~((-1)<<MODEBITS); | |
389 public static final int STARTMASK=(~((-1)<<STARTBITS))<<STARTOFFSET; | |
390 public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; | |
391 public static final int HIGHMASK=SCOREMASK|STARTMASK; | |
392 | |
393 //For some reason changing MODE_DEL from 1 to 0 breaks everything | |
394 private static final byte MODE_DEL=1; | |
395 private static final byte MODE_INS=2; | |
396 private static final byte MODE_SUB=3; | |
397 private static final byte MODE_MATCH=4; | |
398 | |
399 public static final int POINTS_NOREF=-10; | |
400 public static final int POINTS_MATCH=90; | |
401 public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives | |
402 public static final int POINTS_SUB=-143; | |
403 public static final int POINTS_SUB2=-54; | |
404 public static final int POINTS_INS=-207; | |
405 public static final int POINTS_INS2=-51; | |
406 public static final int POINTS_INS3=-37; | |
407 public static final int POINTS_DEL=-273; | |
408 public static final int POINTS_DEL2=-38; | |
409 | |
410 | |
411 // public static final int POINTS_NOREF=-10; | |
412 // public static final int POINTS_NOCALL=-10; | |
413 // public static final int POINTS_MATCH=90; | |
414 // public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives | |
415 // public static final int POINTS_SUB=-143; | |
416 // public static final int POINTS_SUB2=-54; | |
417 // public static final int POINTS_INS=-207; | |
418 // public static final int POINTS_INS2=-51; | |
419 // public static final int POINTS_INS3=-37; | |
420 // public static final int POINTS_DEL=-273; | |
421 // public static final int POINTS_DEL2=-38; | |
422 | |
423 public static final int BAD=MIN_SCORE-1; | |
424 | |
425 | |
426 public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET); | |
427 public static final int POINTSoff_NOREF_MODE_SUB=POINTSoff_NOREF|MODE_SUB; | |
428 public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET); | |
429 public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET); | |
430 public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET); | |
431 public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET); | |
432 public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET); | |
433 public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET); | |
434 public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET); | |
435 public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET); | |
436 public static final int BADoff=(BAD<<SCOREOFFSET); | |
437 public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET; | |
438 public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET; | |
439 | |
440 private int rows; | |
441 private int columns; | |
442 | |
443 public long iterationsLimited=0; | |
444 public long iterationsUnlimited=0; | |
445 | |
446 public boolean verbose=false; | |
447 public boolean verbose2=false; | |
448 | |
449 } |