comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/Bio/pairwise2.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # Copyright 2002 by Jeffrey Chang.
2 # Copyright 2016, 2019, 2020 by Markus Piotrowski.
3 # All rights reserved.
4 #
5 # This file is part of the Biopython distribution and governed by your
6 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
7 # Please see the LICENSE file that should have been included as part of this
8 # package.
9
10 """Pairwise sequence alignment using a dynamic programming algorithm.
11
12 This provides functions to get global and local alignments between two
13 sequences. A global alignment finds the best concordance between all
14 characters in two sequences. A local alignment finds just the
15 subsequences that align the best. Local alignments must have a positive
16 score to be reported and they will not be extended for 'zero counting'
17 matches. This means a local alignment will always start and end with
18 a positive counting match.
19
20 When doing alignments, you can specify the match score and gap
21 penalties. The match score indicates the compatibility between an
22 alignment of two characters in the sequences. Highly compatible
23 characters should be given positive scores, and incompatible ones
24 should be given negative scores or 0. The gap penalties should be
25 negative.
26
27 The names of the alignment functions in this module follow the
28 convention
29 <alignment type>XX
30 where <alignment type> is either "global" or "local" and XX is a 2
31 character code indicating the parameters it takes. The first
32 character indicates the parameters for matches (and mismatches), and
33 the second indicates the parameters for gap penalties.
34
35 The match parameters are::
36
37 CODE DESCRIPTION & OPTIONAL KEYWORDS
38 x No parameters. Identical characters have score of 1, otherwise 0.
39 m A match score is the score of identical chars, otherwise mismatch
40 score. Keywords ``match``, ``mismatch``.
41 d A dictionary returns the score of any pair of characters.
42 Keyword ``match_dict``.
43 c A callback function returns scores. Keyword ``match_fn``.
44
45 The gap penalty parameters are::
46
47 CODE DESCRIPTION & OPTIONAL KEYWORDS
48 x No gap penalties.
49 s Same open and extend gap penalties for both sequences.
50 Keywords ``open``, ``extend``.
51 d The sequences have different open and extend gap penalties.
52 Keywords ``openA``, ``extendA``, ``openB``, ``extendB``.
53 c A callback function returns the gap penalties.
54 Keywords ``gap_A_fn``, ``gap_B_fn``.
55
56 All the different alignment functions are contained in an object
57 ``align``. For example:
58
59 >>> from Bio import pairwise2
60 >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG")
61
62 For better readability, the required arguments can be used with optional keywords:
63
64 >>> alignments = pairwise2.align.globalxx(sequenceA="ACCGT", sequenceB="ACG")
65
66 The result is a list of the alignments between the two strings. Each alignment
67 is a named tuple consisting of the two aligned sequences, the score and the
68 start and end positions of the alignment:
69
70 >>> print(alignments)
71 [Alignment(seqA='ACCGT', seqB='A-CG-', score=3.0, start=0, end=5), ...
72
73 You can access each element of an alignment by index or name:
74
75 >>> alignments[0][2]
76 3.0
77 >>> alignments[0].score
78 3.0
79
80 For a nice printout of an alignment, use the ``format_alignment`` method of
81 the module:
82
83 >>> from Bio.pairwise2 import format_alignment
84 >>> print(format_alignment(*alignments[0]))
85 ACCGT
86 | ||
87 A-CG-
88 Score=3
89 <BLANKLINE>
90
91 All alignment functions have the following arguments:
92
93 - Two sequences: strings, Biopython sequence objects or lists.
94 Lists are useful for supplying sequences which contain residues that are
95 encoded by more than one letter.
96
97 - ``penalize_extend_when_opening``: boolean (default: False).
98 Whether to count an extension penalty when opening a gap. If false, a gap of
99 1 is only penalized an "open" penalty, otherwise it is penalized
100 "open+extend".
101
102 - ``penalize_end_gaps``: boolean.
103 Whether to count the gaps at the ends of an alignment. By default, they are
104 counted for global alignments but not for local ones. Setting
105 ``penalize_end_gaps`` to (boolean, boolean) allows you to specify for the
106 two sequences separately whether gaps at the end of the alignment should be
107 counted.
108
109 - ``gap_char``: string (default: ``'-'``).
110 Which character to use as a gap character in the alignment returned. If your
111 input sequences are lists, you must change this to ``['-']``.
112
113 - ``force_generic``: boolean (default: False).
114 Always use the generic, non-cached, dynamic programming function (slow!).
115 For debugging.
116
117 - ``score_only``: boolean (default: False).
118 Only get the best score, don't recover any alignments. The return value of
119 the function is the score. Faster and uses less memory.
120
121 - ``one_alignment_only``: boolean (default: False).
122 Only recover one alignment.
123
124 The other parameters of the alignment function depend on the function called.
125 Some examples:
126
127 - Find the best global alignment between the two sequences. Identical
128 characters are given 1 point. No points are deducted for mismatches or gaps.
129
130 >>> for a in pairwise2.align.globalxx("ACCGT", "ACG"):
131 ... print(format_alignment(*a))
132 ACCGT
133 | ||
134 A-CG-
135 Score=3
136 <BLANKLINE>
137 ACCGT
138 || |
139 AC-G-
140 Score=3
141 <BLANKLINE>
142
143 - Same thing as before, but with a local alignment. Note that
144 ``format_alignment`` will only show the aligned parts of the sequences,
145 together with the starting positions.
146
147 >>> for a in pairwise2.align.localxx("ACCGT", "ACG"):
148 ... print(format_alignment(*a))
149 1 ACCG
150 | ||
151 1 A-CG
152 Score=3
153 <BLANKLINE>
154 1 ACCG
155 || |
156 1 AC-G
157 Score=3
158 <BLANKLINE>
159
160 To restore the 'historic' behaviour of ``format_alignemt``, i.e., showing
161 also the un-aligned parts of both sequences, use the new keyword parameter
162 ``full_sequences``:
163
164 >>> for a in pairwise2.align.localxx("ACCGT", "ACG"):
165 ... print(format_alignment(*a, full_sequences=True))
166 ACCGT
167 | ||
168 A-CG-
169 Score=3
170 <BLANKLINE>
171 ACCGT
172 || |
173 AC-G-
174 Score=3
175 <BLANKLINE>
176
177
178 - Do a global alignment. Identical characters are given 2 points, 1 point is
179 deducted for each non-identical character. Don't penalize gaps.
180
181 >>> for a in pairwise2.align.globalmx("ACCGT", "ACG", 2, -1):
182 ... print(format_alignment(*a))
183 ACCGT
184 | ||
185 A-CG-
186 Score=6
187 <BLANKLINE>
188 ACCGT
189 || |
190 AC-G-
191 Score=6
192 <BLANKLINE>
193
194 - Same as above, except now 0.5 points are deducted when opening a gap, and
195 0.1 points are deducted when extending it.
196
197 >>> for a in pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1):
198 ... print(format_alignment(*a))
199 ACCGT
200 | ||
201 A-CG-
202 Score=5
203 <BLANKLINE>
204 ACCGT
205 || |
206 AC-G-
207 Score=5
208 <BLANKLINE>
209
210 - Note that you can use keywords to increase the readability, e.g.:
211
212 >>> a = pairwise2.align.globalms("ACGT", "ACG", match=2, mismatch=-1, open=-.5,
213 ... extend=-.1)
214
215 - Depending on the penalties, a gap in one sequence may be followed by a gap in
216 the other sequence.If you don't like this behaviour, increase the gap-open
217 penalty:
218
219 >>> for a in pairwise2.align.globalms("A", "T", 5, -4, -1, -.1):
220 ... print(format_alignment(*a))
221 A-
222 <BLANKLINE>
223 -T
224 Score=-2
225 <BLANKLINE>
226 >>> for a in pairwise2.align.globalms("A", "T", 5, -4, -3, -.1):
227 ... print(format_alignment(*a))
228 A
229 .
230 T
231 Score=-4
232 <BLANKLINE>
233
234 - The alignment function can also use known matrices already included in
235 Biopython (in ``Bio.Align.substitution_matrices``):
236
237 >>> from Bio.Align import substitution_matrices
238 >>> matrix = substitution_matrices.load("BLOSUM62")
239 >>> for a in pairwise2.align.globaldx("KEVLA", "EVL", matrix):
240 ... print(format_alignment(*a))
241 KEVLA
242 |||
243 -EVL-
244 Score=13
245 <BLANKLINE>
246
247 - With the parameter ``c`` you can define your own match- and gap functions.
248 E.g. to define an affine logarithmic gap function and using it:
249
250 >>> from math import log
251 >>> def gap_function(x, y): # x is gap position in seq, y is gap length
252 ... if y == 0: # No gap
253 ... return 0
254 ... elif y == 1: # Gap open penalty
255 ... return -2
256 ... return - (2 + y/4.0 + log(y)/2.0)
257 ...
258 >>> alignment = pairwise2.align.globalmc("ACCCCCGT", "ACG", 5, -4,
259 ... gap_function, gap_function)
260
261 You can define different gap functions for each sequence.
262 Self-defined match functions must take the two residues to be compared and
263 return a score.
264
265 To see a description of the parameters for a function, please look at
266 the docstring for the function via the help function, e.g.
267 type ``help(pairwise2.align.localds)`` at the Python prompt.
268
269 """ # noqa: W291
270
271 import warnings
272 from collections import namedtuple
273
274 from Bio import BiopythonWarning
275 from Bio import BiopythonDeprecationWarning
276 from Bio.Align import substitution_matrices
277
278 warnings.warn(
279 "Bio.pairwise2 has been deprecated, and we intend to remove it in a "
280 "future release of Biopython. As an alternative, please consider using "
281 "Bio.Align.PairwiseAligner as a replacement, and contact the "
282 "Biopython developers if you still need the Bio.pairwise2 module.",
283 BiopythonDeprecationWarning,
284 )
285
286
287 MAX_ALIGNMENTS = 1000 # maximum alignments recovered in traceback
288
289 Alignment = namedtuple("Alignment", ("seqA, seqB, score, start, end"))
290
291
292 class align:
293 """Provide functions that do alignments.
294
295 Alignment functions are called as:
296
297 pairwise2.align.globalXX
298
299 or
300
301 pairwise2.align.localXX
302
303 Where XX is a 2 character code indicating the match/mismatch parameters
304 (first character, either x, m, d or c) and the gap penalty parameters
305 (second character, either x, s, d, or c).
306
307 For a detailed description read the main module's docstring (e.g.,
308 type ``help(pairwise2)``).
309 To see a description of the parameters for a function, please
310 look at the docstring for the function, e.g. type
311 ``help(pairwise2.align.localds)`` at the Python prompt.
312 """
313
314 class alignment_function:
315 """Callable class which impersonates an alignment function.
316
317 The constructor takes the name of the function. This class
318 will decode the name of the function to figure out how to
319 interpret the parameters.
320 """
321
322 # match code -> tuple of (parameters, docstring)
323 match2args = {
324 "x": ([], ""),
325 "m": (
326 ["match", "mismatch"],
327 "match is the score to given to identical characters.\n"
328 "mismatch is the score given to non-identical ones.",
329 ),
330 "d": (
331 ["match_dict"],
332 "match_dict is a dictionary where the keys are tuples\n"
333 "of pairs of characters and the values are the scores,\n"
334 "e.g. ('A', 'C') : 2.5.",
335 ),
336 "c": (
337 ["match_fn"],
338 "match_fn is a callback function that takes two "
339 "characters and returns the score between them.",
340 ),
341 }
342 # penalty code -> tuple of (parameters, docstring)
343 penalty2args = {
344 "x": ([], ""),
345 "s": (
346 ["open", "extend"],
347 "open and extend are the gap penalties when a gap is\n"
348 "opened and extended. They should be negative.",
349 ),
350 "d": (
351 ["openA", "extendA", "openB", "extendB"],
352 "openA and extendA are the gap penalties for sequenceA,\n"
353 "and openB and extendB for sequenceB. The penalties\n"
354 "should be negative.",
355 ),
356 "c": (
357 ["gap_A_fn", "gap_B_fn"],
358 "gap_A_fn and gap_B_fn are callback functions that takes\n"
359 "(1) the index where the gap is opened, and (2) the length\n"
360 "of the gap. They should return a gap penalty.",
361 ),
362 }
363
364 def __init__(self, name):
365 """Check to make sure the name of the function is reasonable."""
366 if name.startswith("global"):
367 if len(name) != 8:
368 raise AttributeError("function should be globalXX")
369 elif name.startswith("local"):
370 if len(name) != 7:
371 raise AttributeError("function should be localXX")
372 else:
373 raise AttributeError(name)
374 align_type, match_type, penalty_type = name[:-2], name[-2], name[-1]
375 try:
376 match_args, match_doc = self.match2args[match_type]
377 except KeyError:
378 raise AttributeError(f"unknown match type {match_type!r}")
379 try:
380 penalty_args, penalty_doc = self.penalty2args[penalty_type]
381 except KeyError:
382 raise AttributeError(f"unknown penalty type {penalty_type!r}")
383
384 # Now get the names of the parameters to this function.
385 param_names = ["sequenceA", "sequenceB"]
386 param_names.extend(match_args)
387 param_names.extend(penalty_args)
388 self.function_name = name
389 self.align_type = align_type
390 self.param_names = param_names
391
392 self.__name__ = self.function_name
393 # Set the doc string.
394 doc = f"{self.__name__}({', '.join(self.param_names)}) -> alignments\n"
395 doc += """\
396 \nThe following parameters can also be used with optional
397 keywords of the same name.\n\n
398 sequenceA and sequenceB must be of the same type, either
399 strings, lists or Biopython sequence objects.\n
400 """
401 if match_doc:
402 doc += f"\n{match_doc}\n"
403 if penalty_doc:
404 doc += f"\n{penalty_doc}\n"
405 doc += """\
406 \nalignments is a list of named tuples (seqA, seqB, score,
407 begin, end). seqA and seqB are strings showing the alignment
408 between the sequences. score is the score of the alignment.
409 begin and end are indexes of seqA and seqB that indicate
410 where the alignment occurs.
411 """
412 self.__doc__ = doc
413
414 def decode(self, *args, **keywds):
415 """Decode the arguments for the _align function.
416
417 keywds will get passed to it, so translate the arguments
418 to this function into forms appropriate for _align.
419 """
420 keywds = keywds.copy()
421
422 # Replace possible "keywords" with arguments:
423 args += (len(self.param_names) - len(args)) * (None,)
424 for key in keywds.copy():
425 if key in self.param_names:
426 _index = self.param_names.index(key)
427 args = args[:_index] + (keywds[key],) + args[_index:]
428 del keywds[key]
429 args = tuple(arg for arg in args if arg is not None)
430
431 if len(args) != len(self.param_names):
432 raise TypeError(
433 "%s takes exactly %d argument (%d given)"
434 % (self.function_name, len(self.param_names), len(args))
435 )
436
437 i = 0
438 while i < len(self.param_names):
439 if self.param_names[i] in [
440 "sequenceA",
441 "sequenceB",
442 "gap_A_fn",
443 "gap_B_fn",
444 "match_fn",
445 ]:
446 keywds[self.param_names[i]] = args[i]
447 i += 1
448 elif self.param_names[i] == "match":
449 assert self.param_names[i + 1] == "mismatch"
450 match, mismatch = args[i], args[i + 1]
451 keywds["match_fn"] = identity_match(match, mismatch)
452 i += 2
453 elif self.param_names[i] == "match_dict":
454 keywds["match_fn"] = dictionary_match(args[i])
455 i += 1
456 elif self.param_names[i] == "open":
457 assert self.param_names[i + 1] == "extend"
458 open, extend = args[i], args[i + 1]
459 pe = keywds.get("penalize_extend_when_opening", 0)
460 keywds["gap_A_fn"] = affine_penalty(open, extend, pe)
461 keywds["gap_B_fn"] = affine_penalty(open, extend, pe)
462 i += 2
463 elif self.param_names[i] == "openA":
464 assert self.param_names[i + 3] == "extendB"
465 openA, extendA, openB, extendB = args[i : i + 4]
466 pe = keywds.get("penalize_extend_when_opening", 0)
467 keywds["gap_A_fn"] = affine_penalty(openA, extendA, pe)
468 keywds["gap_B_fn"] = affine_penalty(openB, extendB, pe)
469 i += 4
470 else:
471 raise ValueError(f"unknown parameter {self.param_names[i]!r}")
472
473 # Here are the default parameters for _align. Assign
474 # these to keywds, unless already specified.
475 pe = keywds.get("penalize_extend_when_opening", 0)
476 default_params = [
477 ("match_fn", identity_match(1, 0)),
478 ("gap_A_fn", affine_penalty(0, 0, pe)),
479 ("gap_B_fn", affine_penalty(0, 0, pe)),
480 ("penalize_extend_when_opening", 0),
481 ("penalize_end_gaps", self.align_type == "global"),
482 ("align_globally", self.align_type == "global"),
483 ("gap_char", "-"),
484 ("force_generic", 0),
485 ("score_only", 0),
486 ("one_alignment_only", 0),
487 ]
488 for name, default in default_params:
489 keywds[name] = keywds.get(name, default)
490 value = keywds["penalize_end_gaps"]
491 try:
492 n = len(value)
493 except TypeError:
494 keywds["penalize_end_gaps"] = tuple([value] * 2)
495 else:
496 assert n == 2
497 return keywds
498
499 def __call__(self, *args, **keywds):
500 """Call the alignment instance already created."""
501 keywds = self.decode(*args, **keywds)
502 return _align(**keywds)
503
504 def __getattr__(self, attr):
505 """Call alignment_function() to check and decode the attributes."""
506 # The following 'magic' is needed to rewrite the class docstring
507 # dynamically:
508 wrapper = self.alignment_function(attr)
509 wrapper_type = type(wrapper)
510 wrapper_dict = wrapper_type.__dict__.copy()
511 wrapper_dict["__doc__"] = wrapper.__doc__
512 new_alignment_function = type("alignment_function", (object,), wrapper_dict)
513
514 return new_alignment_function(attr)
515
516
517 align = align()
518
519
520 def _align(
521 sequenceA,
522 sequenceB,
523 match_fn,
524 gap_A_fn,
525 gap_B_fn,
526 penalize_extend_when_opening,
527 penalize_end_gaps,
528 align_globally,
529 gap_char,
530 force_generic,
531 score_only,
532 one_alignment_only,
533 ):
534 """Return optimal alignments between two sequences (PRIVATE).
535
536 This method either returns a list of optimal alignments (with the same
537 score) or just the optimal score.
538 """
539 if not sequenceA or not sequenceB:
540 return []
541 try:
542 sequenceA + gap_char
543 sequenceB + gap_char
544 except TypeError:
545 raise TypeError(
546 "both sequences must be of the same type, either "
547 "string/sequence object or list. Gap character must "
548 "fit the sequence type (string or list)"
549 )
550
551 if not isinstance(sequenceA, list):
552 sequenceA = str(sequenceA)
553 if not isinstance(sequenceB, list):
554 sequenceB = str(sequenceB)
555 if not align_globally and (penalize_end_gaps[0] or penalize_end_gaps[1]):
556 warnings.warn(
557 '"penalize_end_gaps" should not be used in local '
558 "alignments. The resulting score may be wrong.",
559 BiopythonWarning,
560 )
561
562 if (
563 (not force_generic)
564 and isinstance(gap_A_fn, affine_penalty)
565 and isinstance(gap_B_fn, affine_penalty)
566 ):
567 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
568 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
569 matrices = _make_score_matrix_fast(
570 sequenceA,
571 sequenceB,
572 match_fn,
573 open_A,
574 extend_A,
575 open_B,
576 extend_B,
577 penalize_extend_when_opening,
578 penalize_end_gaps,
579 align_globally,
580 score_only,
581 )
582 else:
583 matrices = _make_score_matrix_generic(
584 sequenceA,
585 sequenceB,
586 match_fn,
587 gap_A_fn,
588 gap_B_fn,
589 penalize_end_gaps,
590 align_globally,
591 score_only,
592 )
593
594 score_matrix, trace_matrix, best_score = matrices
595
596 # print("SCORE %s" % print_matrix(score_matrix))
597 # print("TRACEBACK %s" % print_matrix(trace_matrix))
598
599 # If they only want the score, then return it.
600 if score_only:
601 return best_score
602
603 starts = _find_start(score_matrix, best_score, align_globally)
604
605 # Recover the alignments and return them.
606 alignments = _recover_alignments(
607 sequenceA,
608 sequenceB,
609 starts,
610 best_score,
611 score_matrix,
612 trace_matrix,
613 align_globally,
614 gap_char,
615 one_alignment_only,
616 gap_A_fn,
617 gap_B_fn,
618 )
619 if not alignments:
620 # This may happen, see recover_alignments for explanation
621 score_matrix, trace_matrix = _reverse_matrices(score_matrix, trace_matrix)
622 starts = [(z, (y, x)) for z, (x, y) in starts]
623 alignments = _recover_alignments(
624 sequenceB,
625 sequenceA,
626 starts,
627 best_score,
628 score_matrix,
629 trace_matrix,
630 align_globally,
631 gap_char,
632 one_alignment_only,
633 gap_B_fn,
634 gap_A_fn,
635 reverse=True,
636 )
637 return alignments
638
639
640 def _make_score_matrix_generic(
641 sequenceA,
642 sequenceB,
643 match_fn,
644 gap_A_fn,
645 gap_B_fn,
646 penalize_end_gaps,
647 align_globally,
648 score_only,
649 ):
650 """Generate a score and traceback matrix (PRIVATE).
651
652 This implementation according to Needleman-Wunsch allows the usage of
653 general gap functions and is rather slow. It is automatically called if
654 you define your own gap functions. You can force the usage of this method
655 with ``force_generic=True``.
656 """
657 local_max_score = 0
658 # Create the score and traceback matrices. These should be in the
659 # shape:
660 # sequenceA (down) x sequenceB (across)
661 lenA, lenB = len(sequenceA), len(sequenceB)
662 score_matrix, trace_matrix = [], []
663 for i in range(lenA + 1):
664 score_matrix.append([None] * (lenB + 1))
665 if not score_only:
666 trace_matrix.append([None] * (lenB + 1))
667
668 # Initialize first row and column with gap scores. This is like opening up
669 # i gaps at the beginning of sequence A or B.
670 for i in range(lenA + 1):
671 if penalize_end_gaps[1]: # [1]:gap in sequence B
672 score = gap_B_fn(0, i)
673 else:
674 score = 0.0
675 score_matrix[i][0] = score
676 for i in range(lenB + 1):
677 if penalize_end_gaps[0]: # [0]:gap in sequence A
678 score = gap_A_fn(0, i)
679 else:
680 score = 0.0
681 score_matrix[0][i] = score
682
683 # Fill in the score matrix. Each position in the matrix
684 # represents an alignment between a character from sequence A to
685 # one in sequence B. As I iterate through the matrix, find the
686 # alignment by choose the best of:
687 # 1) extending a previous alignment without gaps
688 # 2) adding a gap in sequenceA
689 # 3) adding a gap in sequenceB
690 for row in range(1, lenA + 1):
691 for col in range(1, lenB + 1):
692 # First, calculate the score that would occur by extending
693 # the alignment without gaps.
694 # fmt: off
695 nogap_score = (
696 score_matrix[row - 1][col - 1]
697 + match_fn(sequenceA[row - 1], sequenceB[col - 1])
698 )
699
700 # fmt: on
701 # Try to find a better score by opening gaps in sequenceA.
702 # Do this by checking alignments from each column in the row.
703 # Each column represents a different character to align from,
704 # and thus a different length gap.
705 # Although the gap function does not distinguish between opening
706 # and extending a gap, we distinguish them for the backtrace.
707 if not penalize_end_gaps[0] and row == lenA:
708 row_open = score_matrix[row][col - 1]
709 row_extend = max(score_matrix[row][x] for x in range(col))
710 else:
711 row_open = score_matrix[row][col - 1] + gap_A_fn(row, 1)
712 row_extend = max(
713 score_matrix[row][x] + gap_A_fn(row, col - x) for x in range(col)
714 )
715
716 # Try to find a better score by opening gaps in sequenceB.
717 if not penalize_end_gaps[1] and col == lenB:
718 col_open = score_matrix[row - 1][col]
719 col_extend = max(score_matrix[x][col] for x in range(row))
720 else:
721 col_open = score_matrix[row - 1][col] + gap_B_fn(col, 1)
722 col_extend = max(
723 score_matrix[x][col] + gap_B_fn(col, row - x) for x in range(row)
724 )
725
726 best_score = max(nogap_score, row_open, row_extend, col_open, col_extend)
727 local_max_score = max(local_max_score, best_score)
728 if not align_globally and best_score < 0:
729 score_matrix[row][col] = 0.0
730 else:
731 score_matrix[row][col] = best_score
732
733 # The backtrace is encoded binary. See _make_score_matrix_fast
734 # for details.
735 if not score_only:
736 trace_score = 0
737 if rint(nogap_score) == rint(best_score):
738 trace_score += 2
739 if rint(row_open) == rint(best_score):
740 trace_score += 1
741 if rint(row_extend) == rint(best_score):
742 trace_score += 8
743 if rint(col_open) == rint(best_score):
744 trace_score += 4
745 if rint(col_extend) == rint(best_score):
746 trace_score += 16
747 trace_matrix[row][col] = trace_score
748
749 if not align_globally:
750 best_score = local_max_score
751
752 return score_matrix, trace_matrix, best_score
753
754
755 def _make_score_matrix_fast(
756 sequenceA,
757 sequenceB,
758 match_fn,
759 open_A,
760 extend_A,
761 open_B,
762 extend_B,
763 penalize_extend_when_opening,
764 penalize_end_gaps,
765 align_globally,
766 score_only,
767 ):
768 """Generate a score and traceback matrix according to Gotoh (PRIVATE).
769
770 This is an implementation of the Needleman-Wunsch dynamic programming
771 algorithm as modified by Gotoh, implementing affine gap penalties.
772 In short, we have three matrices, holding scores for alignments ending
773 in (1) a match/mismatch, (2) a gap in sequence A, and (3) a gap in
774 sequence B, respectively. However, we can combine them in one matrix,
775 which holds the best scores, and store only those values from the
776 other matrices that are actually used for the next step of calculation.
777 The traceback matrix holds the positions for backtracing the alignment.
778 """
779 first_A_gap = calc_affine_penalty(1, open_A, extend_A, penalize_extend_when_opening)
780 first_B_gap = calc_affine_penalty(1, open_B, extend_B, penalize_extend_when_opening)
781 local_max_score = 0
782
783 # Create the score and traceback matrices. These should be in the
784 # shape:
785 # sequenceA (down) x sequenceB (across)
786 lenA, lenB = len(sequenceA), len(sequenceB)
787 score_matrix, trace_matrix = [], []
788 for i in range(lenA + 1):
789 score_matrix.append([None] * (lenB + 1))
790 if not score_only:
791 trace_matrix.append([None] * (lenB + 1))
792
793 # Initialize first row and column with gap scores. This is like opening up
794 # i gaps at the beginning of sequence A or B.
795 for i in range(lenA + 1):
796 if penalize_end_gaps[1]: # [1]:gap in sequence B
797 score = calc_affine_penalty(
798 i, open_B, extend_B, penalize_extend_when_opening
799 )
800 else:
801 score = 0
802 score_matrix[i][0] = score
803 for i in range(lenB + 1):
804 if penalize_end_gaps[0]: # [0]:gap in sequence A
805 score = calc_affine_penalty(
806 i, open_A, extend_A, penalize_extend_when_opening
807 )
808 else:
809 score = 0
810 score_matrix[0][i] = score
811
812 # Now initialize the col 'matrix'. Actually this is only a one dimensional
813 # list, since we only need the col scores from the last row.
814 col_score = [0] # Best score, if actual alignment ends with gap in seqB
815 for i in range(1, lenB + 1):
816 col_score.append(
817 calc_affine_penalty(i, 2 * open_B, extend_B, penalize_extend_when_opening)
818 )
819
820 # The row 'matrix' is calculated on the fly. Here we only need the actual
821 # score.
822 # Now, filling up the score and traceback matrices:
823 for row in range(1, lenA + 1):
824 row_score = calc_affine_penalty(
825 row, 2 * open_A, extend_A, penalize_extend_when_opening
826 )
827 for col in range(1, lenB + 1):
828 # Calculate the score that would occur by extending the
829 # alignment without gaps.
830 # fmt: off
831 nogap_score = (
832 score_matrix[row - 1][col - 1]
833 + match_fn(sequenceA[row - 1], sequenceB[col - 1])
834 )
835 # fmt: on
836 # Check the score that would occur if there were a gap in
837 # sequence A. This could come from opening a new gap or
838 # extending an existing one.
839 # A gap in sequence A can also be opened if it follows a gap in
840 # sequence B: A-
841 # -B
842 if not penalize_end_gaps[0] and row == lenA:
843 row_open = score_matrix[row][col - 1]
844 row_extend = row_score
845 else:
846 row_open = score_matrix[row][col - 1] + first_A_gap
847 row_extend = row_score + extend_A
848 row_score = max(row_open, row_extend)
849
850 # The same for sequence B:
851 if not penalize_end_gaps[1] and col == lenB:
852 col_open = score_matrix[row - 1][col]
853 col_extend = col_score[col]
854 else:
855 col_open = score_matrix[row - 1][col] + first_B_gap
856 col_extend = col_score[col] + extend_B
857 col_score[col] = max(col_open, col_extend)
858
859 best_score = max(nogap_score, col_score[col], row_score)
860 local_max_score = max(local_max_score, best_score)
861 if not align_globally and best_score < 0:
862 score_matrix[row][col] = 0
863 else:
864 score_matrix[row][col] = best_score
865
866 # Now the trace_matrix. The edges of the backtrace are encoded
867 # binary: 1 = open gap in seqA, 2 = match/mismatch of seqA and
868 # seqB, 4 = open gap in seqB, 8 = extend gap in seqA, and
869 # 16 = extend gap in seqB. This values can be summed up.
870 # Thus, the trace score 7 means that the best score can either
871 # come from opening a gap in seqA (=1), pairing two characters
872 # of seqA and seqB (+2=3) or opening a gap in seqB (+4=7).
873 # However, if we only want the score we don't care about the trace.
874 if not score_only:
875 row_score_rint = rint(row_score)
876 col_score_rint = rint(col_score[col])
877 row_trace_score = 0
878 col_trace_score = 0
879 if rint(row_open) == row_score_rint:
880 row_trace_score += 1 # Open gap in seqA
881 if rint(row_extend) == row_score_rint:
882 row_trace_score += 8 # Extend gap in seqA
883 if rint(col_open) == col_score_rint:
884 col_trace_score += 4 # Open gap in seqB
885 if rint(col_extend) == col_score_rint:
886 col_trace_score += 16 # Extend gap in seqB
887
888 trace_score = 0
889 best_score_rint = rint(best_score)
890 if rint(nogap_score) == best_score_rint:
891 trace_score += 2 # Align seqA with seqB
892 if row_score_rint == best_score_rint:
893 trace_score += row_trace_score
894 if col_score_rint == best_score_rint:
895 trace_score += col_trace_score
896 trace_matrix[row][col] = trace_score
897
898 if not align_globally:
899 best_score = local_max_score
900
901 return score_matrix, trace_matrix, best_score
902
903
904 def _recover_alignments(
905 sequenceA,
906 sequenceB,
907 starts,
908 best_score,
909 score_matrix,
910 trace_matrix,
911 align_globally,
912 gap_char,
913 one_alignment_only,
914 gap_A_fn,
915 gap_B_fn,
916 reverse=False,
917 ):
918 """Do the backtracing and return a list of alignments (PRIVATE).
919
920 Recover the alignments by following the traceback matrix. This
921 is a recursive procedure, but it's implemented here iteratively
922 with a stack.
923
924 sequenceA and sequenceB may be sequences, including strings,
925 lists, or list-like objects. In order to preserve the type of
926 the object, we need to use slices on the sequences instead of
927 indexes. For example, sequenceA[row] may return a type that's
928 not compatible with sequenceA, e.g. if sequenceA is a list and
929 sequenceA[row] is a string. Thus, avoid using indexes and use
930 slices, e.g. sequenceA[row:row+1]. Assume that client-defined
931 sequence classes preserve these semantics.
932 """
933 lenA, lenB = len(sequenceA), len(sequenceB)
934 ali_seqA, ali_seqB = sequenceA[0:0], sequenceB[0:0]
935 tracebacks = []
936 in_process = []
937
938 for start in starts:
939 score, (row, col) = start
940 begin = 0
941 if align_globally:
942 end = None
943 else:
944 # If this start is a zero-extension: don't start here!
945 if (score, (row - 1, col - 1)) in starts:
946 continue
947 # Local alignments should start with a positive score!
948 if score <= 0:
949 continue
950 # Local alignments should not end with a gap!:
951 trace = trace_matrix[row][col]
952 if (trace - trace % 2) % 4 == 2: # Trace contains 'nogap', fine!
953 trace_matrix[row][col] = 2
954 # If not, don't start here!
955 else:
956 continue
957 end = -max(lenA - row, lenB - col)
958 if not end:
959 end = None
960 col_distance = lenB - col
961 row_distance = lenA - row
962
963 # fmt: off
964 ali_seqA = (
965 (col_distance - row_distance) * gap_char
966 + sequenceA[lenA - 1 : row - 1 : -1]
967 )
968 ali_seqB = (
969 (row_distance - col_distance) * gap_char
970 + sequenceB[lenB - 1 : col - 1 : -1]
971 )
972 # fmt: on
973 in_process += [
974 (ali_seqA, ali_seqB, end, row, col, False, trace_matrix[row][col])
975 ]
976 while in_process and len(tracebacks) < MAX_ALIGNMENTS:
977 # Although we allow a gap in seqB to be followed by a gap in seqA,
978 # we don't want to allow it the other way round, since this would
979 # give redundant alignments of type: A- vs. -A
980 # -B B-
981 # Thus we need to keep track if a gap in seqA was opened (col_gap)
982 # and stop the backtrace (dead_end) if a gap in seqB follows.
983 #
984 # Attention: This may fail, if the gap-penalties for both strands are
985 # different. In this case the second alignment may be the only optimal
986 # alignment. Thus it can happen that no alignment is returned. For
987 # this case a workaround was implemented, which reverses the input and
988 # the matrices (this happens in _reverse_matrices) and repeats the
989 # backtrace. The variable 'reverse' keeps track of this.
990 dead_end = False
991 ali_seqA, ali_seqB, end, row, col, col_gap, trace = in_process.pop()
992
993 while (row > 0 or col > 0) and not dead_end:
994 cache = (ali_seqA[:], ali_seqB[:], end, row, col, col_gap)
995
996 # If trace is empty we have reached at least one border of the
997 # matrix or the end of a local alignment. Just add the rest of
998 # the sequence(s) and fill with gaps if necessary.
999 if not trace:
1000 if col and col_gap:
1001 dead_end = True
1002 else:
1003 ali_seqA, ali_seqB = _finish_backtrace(
1004 sequenceA, sequenceB, ali_seqA, ali_seqB, row, col, gap_char
1005 )
1006 break
1007 elif trace % 2 == 1: # = row open = open gap in seqA
1008 trace -= 1
1009 if col_gap:
1010 dead_end = True
1011 else:
1012 col -= 1
1013 ali_seqA += gap_char
1014 ali_seqB += sequenceB[col : col + 1]
1015 col_gap = False
1016 elif trace % 4 == 2: # = match/mismatch of seqA with seqB
1017 trace -= 2
1018 row -= 1
1019 col -= 1
1020 ali_seqA += sequenceA[row : row + 1]
1021 ali_seqB += sequenceB[col : col + 1]
1022 col_gap = False
1023 elif trace % 8 == 4: # = col open = open gap in seqB
1024 trace -= 4
1025 row -= 1
1026 ali_seqA += sequenceA[row : row + 1]
1027 ali_seqB += gap_char
1028 col_gap = True
1029 elif trace in (8, 24): # = row extend = extend gap in seqA
1030 trace -= 8
1031 if col_gap:
1032 dead_end = True
1033 else:
1034 col_gap = False
1035 # We need to find the starting point of the extended gap
1036 x = _find_gap_open(
1037 sequenceA,
1038 sequenceB,
1039 ali_seqA,
1040 ali_seqB,
1041 end,
1042 row,
1043 col,
1044 col_gap,
1045 gap_char,
1046 score_matrix,
1047 trace_matrix,
1048 in_process,
1049 gap_A_fn,
1050 col,
1051 row,
1052 "col",
1053 best_score,
1054 align_globally,
1055 )
1056 ali_seqA, ali_seqB, row, col, in_process, dead_end = x
1057 elif trace == 16: # = col extend = extend gap in seqB
1058 trace -= 16
1059 col_gap = True
1060 x = _find_gap_open(
1061 sequenceA,
1062 sequenceB,
1063 ali_seqA,
1064 ali_seqB,
1065 end,
1066 row,
1067 col,
1068 col_gap,
1069 gap_char,
1070 score_matrix,
1071 trace_matrix,
1072 in_process,
1073 gap_B_fn,
1074 row,
1075 col,
1076 "row",
1077 best_score,
1078 align_globally,
1079 )
1080 ali_seqA, ali_seqB, row, col, in_process, dead_end = x
1081
1082 if trace: # There is another path to follow...
1083 cache += (trace,)
1084 in_process.append(cache)
1085 trace = trace_matrix[row][col]
1086 if not align_globally:
1087 if score_matrix[row][col] == best_score:
1088 # We have gone through a 'zero-score' extension, discard it
1089 dead_end = True
1090 elif score_matrix[row][col] <= 0:
1091 # We have reached the end of the backtrace
1092 begin = max(row, col)
1093 trace = 0
1094 if not dead_end:
1095 if not reverse:
1096 tracebacks.append((ali_seqA[::-1], ali_seqB[::-1], score, begin, end))
1097 else:
1098 tracebacks.append((ali_seqB[::-1], ali_seqA[::-1], score, begin, end))
1099 if one_alignment_only:
1100 break
1101 return _clean_alignments(tracebacks)
1102
1103
1104 def _find_start(score_matrix, best_score, align_globally):
1105 """Return a list of starting points (score, (row, col)) (PRIVATE).
1106
1107 Indicating every possible place to start the tracebacks.
1108 """
1109 nrows, ncols = len(score_matrix), len(score_matrix[0])
1110 # In this implementation of the global algorithm, the start will always be
1111 # the bottom right corner of the matrix.
1112 if align_globally:
1113 starts = [(best_score, (nrows - 1, ncols - 1))]
1114 else:
1115 # For local alignments, there may be many different start points.
1116 starts = []
1117 tolerance = 0 # XXX do anything with this?
1118 # Now find all the positions within some tolerance of the best
1119 # score.
1120 for row in range(nrows):
1121 for col in range(ncols):
1122 score = score_matrix[row][col]
1123 if rint(abs(score - best_score)) <= rint(tolerance):
1124 starts.append((score, (row, col)))
1125 return starts
1126
1127
1128 def _reverse_matrices(score_matrix, trace_matrix):
1129 """Reverse score and trace matrices (PRIVATE)."""
1130 reverse_score_matrix = []
1131 reverse_trace_matrix = []
1132 # fmt: off
1133 reverse_trace = {
1134 1: 4, 2: 2, 3: 6, 4: 1, 5: 5, 6: 3, 7: 7, 8: 16, 9: 20, 10: 18, 11: 22, 12: 17,
1135 13: 21, 14: 19, 15: 23, 16: 8, 17: 12, 18: 10, 19: 14, 20: 9, 21: 13, 22: 11,
1136 23: 15, 24: 24, 25: 28, 26: 26, 27: 30, 28: 25, 29: 29, 30: 27, 31: 31,
1137 None: None,
1138 }
1139 # fmt: on
1140 for col in range(len(score_matrix[0])):
1141 new_score_row = []
1142 new_trace_row = []
1143 for row in range(len(score_matrix)):
1144 new_score_row.append(score_matrix[row][col])
1145 new_trace_row.append(reverse_trace[trace_matrix[row][col]])
1146 reverse_score_matrix.append(new_score_row)
1147 reverse_trace_matrix.append(new_trace_row)
1148 return reverse_score_matrix, reverse_trace_matrix
1149
1150
1151 def _clean_alignments(alignments):
1152 """Take a list of alignments and return a cleaned version (PRIVATE).
1153
1154 Remove duplicates, make sure begin and end are set correctly, remove
1155 empty alignments.
1156 """
1157 unique_alignments = []
1158 for align in alignments:
1159 if align not in unique_alignments:
1160 unique_alignments.append(align)
1161 i = 0
1162 while i < len(unique_alignments):
1163 seqA, seqB, score, begin, end = unique_alignments[i]
1164 # Make sure end is set reasonably.
1165 if end is None: # global alignment
1166 end = len(seqA)
1167 elif end < 0:
1168 end = end + len(seqA)
1169 # If there's no alignment here, get rid of it.
1170 if begin >= end:
1171 del unique_alignments[i]
1172 continue
1173 unique_alignments[i] = Alignment(seqA, seqB, score, begin, end)
1174 i += 1
1175 return unique_alignments
1176
1177
1178 def _finish_backtrace(sequenceA, sequenceB, ali_seqA, ali_seqB, row, col, gap_char):
1179 """Add remaining sequences and fill with gaps if necessary (PRIVATE)."""
1180 if row:
1181 ali_seqA += sequenceA[row - 1 :: -1]
1182 if col:
1183 ali_seqB += sequenceB[col - 1 :: -1]
1184 if row > col:
1185 ali_seqB += gap_char * (len(ali_seqA) - len(ali_seqB))
1186 elif col > row:
1187 ali_seqA += gap_char * (len(ali_seqB) - len(ali_seqA))
1188 return ali_seqA, ali_seqB
1189
1190
1191 def _find_gap_open(
1192 sequenceA,
1193 sequenceB,
1194 ali_seqA,
1195 ali_seqB,
1196 end,
1197 row,
1198 col,
1199 col_gap,
1200 gap_char,
1201 score_matrix,
1202 trace_matrix,
1203 in_process,
1204 gap_fn,
1205 target,
1206 index,
1207 direction,
1208 best_score,
1209 align_globally,
1210 ):
1211 """Find the starting point(s) of the extended gap (PRIVATE)."""
1212 dead_end = False
1213 target_score = score_matrix[row][col]
1214 for n in range(target):
1215 if direction == "col":
1216 col -= 1
1217 ali_seqA += gap_char
1218 ali_seqB += sequenceB[col : col + 1]
1219 else:
1220 row -= 1
1221 ali_seqA += sequenceA[row : row + 1]
1222 ali_seqB += gap_char
1223 actual_score = score_matrix[row][col] + gap_fn(index, n + 1)
1224 if not align_globally and score_matrix[row][col] == best_score:
1225 # We have run through a 'zero-score' extension and discard it
1226 dead_end = True
1227 break
1228 if rint(actual_score) == rint(target_score) and n > 0:
1229 if not trace_matrix[row][col]:
1230 break
1231 else:
1232 in_process.append(
1233 (
1234 ali_seqA[:],
1235 ali_seqB[:],
1236 end,
1237 row,
1238 col,
1239 col_gap,
1240 trace_matrix[row][col],
1241 )
1242 )
1243 if not trace_matrix[row][col]:
1244 dead_end = True
1245 return ali_seqA, ali_seqB, row, col, in_process, dead_end
1246
1247
1248 _PRECISION = 1000
1249
1250
1251 def rint(x, precision=_PRECISION):
1252 """Print number with declared precision."""
1253 return int(x * precision + 0.5)
1254
1255
1256 class identity_match:
1257 """Create a match function for use in an alignment.
1258
1259 match and mismatch are the scores to give when two residues are equal
1260 or unequal. By default, match is 1 and mismatch is 0.
1261 """
1262
1263 def __init__(self, match=1, mismatch=0):
1264 """Initialize the class."""
1265 self.match = match
1266 self.mismatch = mismatch
1267
1268 def __call__(self, charA, charB):
1269 """Call a match function instance already created."""
1270 if charA == charB:
1271 return self.match
1272 return self.mismatch
1273
1274
1275 class dictionary_match:
1276 """Create a match function for use in an alignment.
1277
1278 Attributes:
1279 - score_dict - A dictionary where the keys are tuples (residue 1,
1280 residue 2) and the values are the match scores between those residues.
1281 - symmetric - A flag that indicates whether the scores are symmetric.
1282
1283 """
1284
1285 def __init__(self, score_dict, symmetric=1):
1286 """Initialize the class."""
1287 if isinstance(score_dict, substitution_matrices.Array):
1288 score_dict = dict(score_dict) # Access to dict is much faster
1289 self.score_dict = score_dict
1290 self.symmetric = symmetric
1291
1292 def __call__(self, charA, charB):
1293 """Call a dictionary match instance already created."""
1294 if self.symmetric and (charA, charB) not in self.score_dict:
1295 # If the score dictionary is symmetric, then look up the
1296 # score both ways.
1297 charB, charA = charA, charB
1298 return self.score_dict[(charA, charB)]
1299
1300
1301 class affine_penalty:
1302 """Create a gap function for use in an alignment."""
1303
1304 def __init__(self, open, extend, penalize_extend_when_opening=0):
1305 """Initialize the class."""
1306 if open > 0 or extend > 0:
1307 raise ValueError("Gap penalties should be non-positive.")
1308 if not penalize_extend_when_opening and (extend < open):
1309 raise ValueError(
1310 "Gap opening penalty should be higher than "
1311 "gap extension penalty (or equal)"
1312 )
1313 self.open, self.extend = open, extend
1314 self.penalize_extend_when_opening = penalize_extend_when_opening
1315
1316 def __call__(self, index, length):
1317 """Call a gap function instance already created."""
1318 return calc_affine_penalty(
1319 length, self.open, self.extend, self.penalize_extend_when_opening
1320 )
1321
1322
1323 def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
1324 """Calculate a penalty score for the gap function."""
1325 if length <= 0:
1326 return 0.0
1327 penalty = open + extend * length
1328 if not penalize_extend_when_opening:
1329 penalty -= extend
1330 return penalty
1331
1332
1333 def print_matrix(matrix):
1334 """Print out a matrix for debugging purposes."""
1335 # Transpose the matrix and get the length of the values in each column.
1336 matrixT = [[] for x in range(len(matrix[0]))]
1337 for i in range(len(matrix)):
1338 for j in range(len(matrix[i])):
1339 matrixT[j].append(len(str(matrix[i][j])))
1340 ndigits = [max(x) for x in matrixT]
1341 for i in range(len(matrix)):
1342 # Using string formatting trick to add leading spaces,
1343 print(
1344 " ".join("%*s " % (ndigits[j], matrix[i][j]) for j in range(len(matrix[i])))
1345 )
1346
1347
1348 def format_alignment(align1, align2, score, begin, end, full_sequences=False):
1349 """Format the alignment prettily into a string.
1350
1351 IMPORTANT: Gap symbol must be "-" (or ['-'] for lists)!
1352
1353 Since Biopython 1.71 identical matches are shown with a pipe
1354 character, mismatches as a dot, and gaps as a space.
1355
1356 Prior releases just used the pipe character to indicate the
1357 aligned region (matches, mismatches and gaps).
1358
1359 Also, in local alignments, if the alignment does not include
1360 the whole sequences, now only the aligned part is shown,
1361 together with the start positions of the aligned subsequences.
1362 The start positions are 1-based; so start position n is the
1363 n-th base/amino acid in the *un-aligned* sequence.
1364
1365 NOTE: This is different to the alignment's begin/end values,
1366 which give the Python indices (0-based) of the bases/amino acids
1367 in the *aligned* sequences.
1368
1369 If you want to restore the 'historic' behaviour, that means
1370 displaying the whole sequences (including the non-aligned parts),
1371 use ``full_sequences=True``. In this case, the non-aligned leading
1372 and trailing parts are also indicated by spaces in the match-line.
1373 """
1374 align_begin = begin
1375 align_end = end
1376 start1 = start2 = ""
1377 start_m = begin # Begin of match line (how many spaces to include)
1378 # For local alignments:
1379 if not full_sequences and (begin != 0 or end != len(align1)):
1380 # Calculate the actual start positions in the un-aligned sequences
1381 # This will only work if the gap symbol is '-' or ['-']!
1382 start1 = str(len(align1[:begin]) - align1[:begin].count("-") + 1) + " "
1383 start2 = str(len(align2[:begin]) - align2[:begin].count("-") + 1) + " "
1384 start_m = max(len(start1), len(start2))
1385 elif full_sequences:
1386 start_m = 0
1387 begin = 0
1388 end = len(align1)
1389
1390 if isinstance(align1, list):
1391 # List elements will be separated by spaces, since they can be
1392 # of different lengths
1393 align1 = [a + " " for a in align1]
1394 align2 = [a + " " for a in align2]
1395
1396 s1_line = ["{:>{width}}".format(start1, width=start_m)] # seq1 line
1397 m_line = [" " * start_m] # match line
1398 s2_line = ["{:>{width}}".format(start2, width=start_m)] # seq2 line
1399
1400 for n, (a, b) in enumerate(zip(align1[begin:end], align2[begin:end])):
1401 # Since list elements can be of different length, we center them,
1402 # using the maximum length of the two compared elements as width
1403 m_len = max(len(a), len(b))
1404 s1_line.append("{:^{width}}".format(a, width=m_len))
1405 s2_line.append("{:^{width}}".format(b, width=m_len))
1406 if full_sequences and (n < align_begin or n >= align_end):
1407 m_line.append("{:^{width}}".format(" ", width=m_len)) # space
1408 continue
1409 if a == b:
1410 m_line.append("{:^{width}}".format("|", width=m_len)) # match
1411 elif a.strip() == "-" or b.strip() == "-":
1412 m_line.append("{:^{width}}".format(" ", width=m_len)) # gap
1413 else:
1414 m_line.append("{:^{width}}".format(".", width=m_len)) # mismatch
1415
1416 s2_line.append(f"\n Score={score:g}\n")
1417 return "\n".join(["".join(s1_line), "".join(m_line), "".join(s2_line)])
1418
1419
1420 # Try and load C implementations of functions. If I can't,
1421 # then throw a warning and use the pure Python implementations.
1422 # The redefinition is deliberate, thus the no quality assurance
1423 # flag for when using flake8.
1424 # Before, we secure access to the pure Python functions (for testing purposes):
1425
1426 _python_make_score_matrix_fast = _make_score_matrix_fast
1427 _python_rint = rint
1428
1429 try:
1430 from .cpairwise2 import rint, _make_score_matrix_fast # noqa
1431 except ImportError:
1432 warnings.warn(
1433 "Import of C module failed. Falling back to pure Python "
1434 "implementation. This may be slooow...",
1435 BiopythonWarning,
1436 )
1437
1438 if __name__ == "__main__":
1439 from Bio._utils import run_doctest
1440
1441 run_doctest()