Package Bio :: Module pairwise2
[hide private]
[frames] | no frames]

Source Code for Module Bio.pairwise2

  1  # Copyright 2002 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This package implements pairwise sequence alignment using a dynamic 
  7  programming algorithm. 
  8   
  9  This provides functions to get global and local alignments between two 
 10  sequences.  A global alignment finds the best concordance between all 
 11  characters in two sequences.  A local alignment finds just the 
 12  subsequences that align the best. 
 13   
 14  When doing alignments, you can specify the match score and gap 
 15  penalties.  The match score indicates the compatibility between an 
 16  alignment of two characters in the sequences.  Highly compatible 
 17  characters should be given positive scores, and incompatible ones 
 18  should be given negative scores or 0.  The gap penalties should be 
 19  negative. 
 20   
 21  The names of the alignment functions in this module follow the 
 22  convention 
 23  <alignment type>XX 
 24  where <alignment type> is either "global" or "local" and XX is a 2 
 25  character code indicating the parameters it takes.  The first 
 26  character indicates the parameters for matches (and mismatches), and 
 27  the second indicates the parameters for gap penalties. 
 28   
 29  The match parameters are 
 30  CODE  DESCRIPTION 
 31  x     No parameters.  Identical characters have score of 1, otherwise 0. 
 32  m     A match score is the score of identical chars, otherwise mismatch score. 
 33  d     A dictionary returns the score of any pair of characters. 
 34  c     A callback function returns scores. 
 35   
 36  The gap penalty parameters are 
 37  CODE  DESCRIPTION 
 38  x     No gap penalties. 
 39  s     Same open and extend gap penalties for both sequences. 
 40  d     The sequences have different open and extend gap penalties. 
 41  c     A callback function returns the gap penalties. 
 42   
 43  All the different alignment functions are contained in an object 
 44  "align".  For example: 
 45   
 46      >>> from Bio import pairwise2 
 47      >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG") 
 48   
 49  will return a list of the alignments between the two strings.  The 
 50  parameters of the alignment function depends on the function called. 
 51  Some examples: 
 52   
 53      # Find the best global alignment between the two sequences. 
 54      # Identical characters are given 1 point.  No points are deducted 
 55      # for mismatches or gaps. 
 56      >>> from Bio.pairwise2 import format_alignment 
 57      >>> for a in pairwise2.align.globalxx("ACCGT", "ACG"): 
 58      ...     print(format_alignment(*a)) 
 59      ACCGT 
 60      ||||| 
 61      AC-G- 
 62        Score=3 
 63      <BLANKLINE> 
 64      ACCGT 
 65      ||||| 
 66      A-CG- 
 67        Score=3 
 68      <BLANKLINE> 
 69   
 70      # Same thing as before, but with a local alignment. 
 71      >>> for a in pairwise2.align.localxx("ACCGT", "ACG"): 
 72      ...     print(format_alignment(*a)) 
 73      ACCGT 
 74      |||| 
 75      AC-G- 
 76        Score=3 
 77      <BLANKLINE> 
 78      ACCGT 
 79      |||| 
 80      A-CG- 
 81        Score=3 
 82      <BLANKLINE> 
 83   
 84      # Do a global alignment.  Identical characters are given 2 points, 
 85      # 1 point is deducted for each non-identical character. 
 86      >>> for a in pairwise2.align.globalmx("ACCGT", "ACG", 2, -1): 
 87      ...     print(format_alignment(*a)) 
 88      ACCGT 
 89      ||||| 
 90      AC-G- 
 91        Score=6 
 92      <BLANKLINE> 
 93      ACCGT 
 94      ||||| 
 95      A-CG- 
 96        Score=6 
 97      <BLANKLINE> 
 98   
 99      # Same as above, except now 0.5 points are deducted when opening a 
100      # gap, and 0.1 points are deducted when extending it. 
101      >>> for a in pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1): 
102      ...     print(format_alignment(*a)) 
103      ACCGT 
104      ||||| 
105      AC-G- 
106        Score=5 
107      <BLANKLINE> 
108      ACCGT 
109      ||||| 
110      A-CG- 
111        Score=5 
112      <BLANKLINE> 
113   
114  The alignment function can also use known matrices already included in 
115  Biopython ( Bio.SubsMat -> MatrixInfo ). 
116   
117      >>> from Bio.SubsMat import MatrixInfo as matlist 
118      >>> matrix = matlist.blosum62 
119      >>> for a in pairwise2.align.globaldx("KEVLA", "EVL", matrix): 
120      ...     print(format_alignment(*a)) 
121      KEVLA 
122      ||||| 
123      -EVL- 
124        Score=13 
125      <BLANKLINE> 
126   
127  To see a description of the parameters for a function, please look at 
128  the docstring for the function via the help function, e.g. 
129  type help(pairwise2.align.localds) at the Python prompt. 
130  """ 
131  # The alignment functions take some undocumented keyword parameters: 
132  # - penalize_extend_when_opening: boolean 
133  #   Whether to count an extension penalty when opening a gap.  If 
134  #   false, a gap of 1 is only penalize an "open" penalty, otherwise it 
135  #   is penalized "open+extend". 
136  # - penalize_end_gaps: boolean 
137  #   Whether to count the gaps at the ends of an alignment.  By 
138  #   default, they are counted for global alignments but not for local 
139  #   ones. Setting penalize_end_gaps to (boolean, boolean) allows you to 
140  #   specify for the two sequences separately whether gaps at the end of 
141  #   the alignment should be counted. 
142  # - gap_char: string 
143  #   Which character to use as a gap character in the alignment 
144  #   returned.  By default, uses '-'. 
145  # - force_generic: boolean 
146  #   Always use the generic, non-cached, dynamic programming function. 
147  #   For debugging. 
148  # - score_only: boolean 
149  #   Only get the best score, don't recover any alignments.  The return 
150  #   value of the function is the score. 
151  # - one_alignment_only: boolean 
152  #   Only recover one alignment. 
153   
154  from __future__ import print_function 
155   
156  MAX_ALIGNMENTS = 1000   # maximum alignments recovered in traceback 
157   
158   
159 -class align(object):
160 """This class provides functions that do alignments.""" 161
162 - class alignment_function:
163 """This class is callable impersonates an alignment function. 164 The constructor takes the name of the function. This class 165 will decode the name of the function to figure out how to 166 interpret the parameters. 167 168 """ 169 # match code -> tuple of (parameters, docstring) 170 match2args = { 171 'x': ([], ''), 172 'm': (['match', 'mismatch'], 173 """match is the score to given to identical characters. mismatch is 174 the score given to non-identical ones."""), 175 'd': (['match_dict'], 176 """match_dict is a dictionary where the keys are tuples of pairs of 177 characters and the values are the scores, e.g. ("A", "C") : 2.5."""), 178 'c': (['match_fn'], 179 """match_fn is a callback function that takes two characters and 180 returns the score between them."""), 181 } 182 # penalty code -> tuple of (parameters, docstring) 183 penalty2args = { 184 'x': ([], ''), 185 's': (['open', 'extend'], 186 """open and extend are the gap penalties when a gap is opened and 187 extended. They should be negative."""), 188 'd': (['openA', 'extendA', 'openB', 'extendB'], 189 """openA and extendA are the gap penalties for sequenceA, and openB 190 and extendB for sequeneB. The penalties should be negative."""), 191 'c': (['gap_A_fn', 'gap_B_fn'], 192 """gap_A_fn and gap_B_fn are callback functions that takes 1) the 193 index where the gap is opened, and 2) the length of the gap. They 194 should return a gap penalty."""), 195 } 196
197 - def __init__(self, name):
198 # Check to make sure the name of the function is 199 # reasonable. 200 if name.startswith("global"): 201 if len(name) != 8: 202 raise AttributeError("function should be globalXX") 203 elif name.startswith("local"): 204 if len(name) != 7: 205 raise AttributeError("function should be localXX") 206 else: 207 raise AttributeError(name) 208 align_type, match_type, penalty_type = \ 209 name[:-2], name[-2], name[-1] 210 try: 211 match_args, match_doc = self.match2args[match_type] 212 except KeyError as x: 213 raise AttributeError("unknown match type %r" % match_type) 214 try: 215 penalty_args, penalty_doc = self.penalty2args[penalty_type] 216 except KeyError as x: 217 raise AttributeError("unknown penalty type %r" % penalty_type) 218 219 # Now get the names of the parameters to this function. 220 param_names = ['sequenceA', 'sequenceB'] 221 param_names.extend(match_args) 222 param_names.extend(penalty_args) 223 self.function_name = name 224 self.align_type = align_type 225 self.param_names = param_names 226 227 self.__name__ = self.function_name 228 # Set the doc string. 229 doc = "%s(%s) -> alignments\n" % ( 230 self.__name__, ', '.join(self.param_names)) 231 if match_doc: 232 doc += "\n%s\n" % match_doc 233 if penalty_doc: 234 doc += "\n%s\n" % penalty_doc 235 doc += ( 236 """\nalignments is a list of tuples (seqA, seqB, score, begin, end). 237 seqA and seqB are strings showing the alignment between the 238 sequences. score is the score of the alignment. begin and end 239 are indexes into seqA and seqB that indicate the where the 240 alignment occurs. 241 """) 242 self.__doc__ = doc
243
244 - def decode(self, *args, **keywds):
245 # Decode the arguments for the _align function. keywds 246 # will get passed to it, so translate the arguments to 247 # this function into forms appropriate for _align. 248 keywds = keywds.copy() 249 if len(args) != len(self.param_names): 250 raise TypeError("%s takes exactly %d argument (%d given)" 251 % (self.function_name, len(self.param_names), len(args))) 252 i = 0 253 while i < len(self.param_names): 254 if self.param_names[i] in [ 255 'sequenceA', 'sequenceB', 256 'gap_A_fn', 'gap_B_fn', 'match_fn']: 257 keywds[self.param_names[i]] = args[i] 258 i += 1 259 elif self.param_names[i] == 'match': 260 assert self.param_names[i+1] == 'mismatch' 261 match, mismatch = args[i], args[i+1] 262 keywds['match_fn'] = identity_match(match, mismatch) 263 i += 2 264 elif self.param_names[i] == 'match_dict': 265 keywds['match_fn'] = dictionary_match(args[i]) 266 i += 1 267 elif self.param_names[i] == 'open': 268 assert self.param_names[i+1] == 'extend' 269 open, extend = args[i], args[i+1] 270 pe = keywds.get('penalize_extend_when_opening', 0) 271 keywds['gap_A_fn'] = affine_penalty(open, extend, pe) 272 keywds['gap_B_fn'] = affine_penalty(open, extend, pe) 273 i += 2 274 elif self.param_names[i] == 'openA': 275 assert self.param_names[i+3] == 'extendB' 276 openA, extendA, openB, extendB = args[i:i+4] 277 pe = keywds.get('penalize_extend_when_opening', 0) 278 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe) 279 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe) 280 i += 4 281 else: 282 raise ValueError("unknown parameter %r" 283 % self.param_names[i]) 284 285 # Here are the default parameters for _align. Assign 286 # these to keywds, unless already specified. 287 pe = keywds.get('penalize_extend_when_opening', 0) 288 default_params = [ 289 ('match_fn', identity_match(1, 0)), 290 ('gap_A_fn', affine_penalty(0, 0, pe)), 291 ('gap_B_fn', affine_penalty(0, 0, pe)), 292 ('penalize_extend_when_opening', 0), 293 ('penalize_end_gaps', self.align_type == 'global'), 294 ('align_globally', self.align_type == 'global'), 295 ('gap_char', '-'), 296 ('force_generic', 0), 297 ('score_only', 0), 298 ('one_alignment_only', 0) 299 ] 300 for name, default in default_params: 301 keywds[name] = keywds.get(name, default) 302 value = keywds['penalize_end_gaps'] 303 try: 304 n = len(value) 305 except TypeError: 306 keywds['penalize_end_gaps'] = tuple([value]*2) 307 else: 308 assert n==2 309 return keywds
310
311 - def __call__(self, *args, **keywds):
312 keywds = self.decode(*args, **keywds) 313 return _align(**keywds)
314
315 - def __getattr__(self, attr):
316 return self.alignment_function(attr)
317 align = align() 318 319
320 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 321 penalize_extend_when_opening, penalize_end_gaps, 322 align_globally, gap_char, force_generic, score_only, 323 one_alignment_only):
324 if not sequenceA or not sequenceB: 325 return [] 326 327 if (not force_generic) and isinstance(gap_A_fn, affine_penalty) \ 328 and isinstance(gap_B_fn, affine_penalty): 329 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend 330 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend 331 x = _make_score_matrix_fast( 332 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 333 penalize_extend_when_opening, penalize_end_gaps, align_globally, 334 score_only) 335 else: 336 x = _make_score_matrix_generic( 337 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 338 penalize_extend_when_opening, penalize_end_gaps, align_globally, 339 score_only) 340 score_matrix, trace_matrix = x 341 342 #print("SCORE %s" % print_matrix(score_matrix)) 343 #print("TRACEBACK %s" % print_matrix(trace_matrix)) 344 345 # Look for the proper starting point. Get a list of all possible 346 # starting points. 347 starts = _find_start( 348 score_matrix, sequenceA, sequenceB, 349 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally) 350 # Find the highest score. 351 best_score = max([x[0] for x in starts]) 352 353 # If they only want the score, then return it. 354 if score_only: 355 return best_score 356 357 tolerance = 0 # XXX do anything with this? 358 # Now find all the positions within some tolerance of the best 359 # score. 360 i = 0 361 while i < len(starts): 362 score, pos = starts[i] 363 if rint(abs(score-best_score)) > rint(tolerance): 364 del starts[i] 365 else: 366 i += 1 367 368 # Recover the alignments and return them. 369 x = _recover_alignments( 370 sequenceA, sequenceB, starts, score_matrix, trace_matrix, 371 align_globally, gap_char, one_alignment_only) 372 return x
373 374
375 -def _make_score_matrix_generic( 376 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 377 penalize_extend_when_opening, penalize_end_gaps, align_globally, 378 score_only):
379 # This is an implementation of the Needleman-Wunsch dynamic 380 # programming algorithm for aligning sequences. 381 382 # Create the score and traceback matrices. These should be in the 383 # shape: 384 # sequenceA (down) x sequenceB (across) 385 lenA, lenB = len(sequenceA), len(sequenceB) 386 score_matrix, trace_matrix = [], [] 387 for i in range(lenA): 388 score_matrix.append([None] * lenB) 389 trace_matrix.append([[None]] * lenB) 390 391 # The top and left borders of the matrices are special cases 392 # because there are no previously aligned characters. To simplify 393 # the main loop, handle these separately. 394 for i in range(lenA): 395 # Align the first residue in sequenceB to the ith residue in 396 # sequence A. This is like opening up i gaps at the beginning 397 # of sequence B. 398 score = match_fn(sequenceA[i], sequenceB[0]) 399 if penalize_end_gaps[1]: 400 score += gap_B_fn(0, i) 401 score_matrix[i][0] = score 402 for i in range(1, lenB): 403 score = match_fn(sequenceA[0], sequenceB[i]) 404 if penalize_end_gaps[0]: 405 score += gap_A_fn(0, i) 406 score_matrix[0][i] = score 407 408 # Fill in the score matrix. Each position in the matrix 409 # represents an alignment between a character from sequenceA to 410 # one in sequence B. As I iterate through the matrix, find the 411 # alignment by choose the best of: 412 # 1) extending a previous alignment without gaps 413 # 2) adding a gap in sequenceA 414 # 3) adding a gap in sequenceB 415 for row in range(1, lenA): 416 for col in range(1, lenB): 417 # First, calculate the score that would occur by extending 418 # the alignment without gaps. 419 best_score = score_matrix[row-1][col-1] 420 best_score_rint = rint(best_score) 421 best_indexes = [(row-1, col-1)] 422 423 # Try to find a better score by opening gaps in sequenceA. 424 # Do this by checking alignments from each column in the 425 # previous row. Each column represents a different 426 # character to align from, and thus a different length 427 # gap. 428 for i in range(0, col-1): 429 score = score_matrix[row-1][i] + gap_A_fn(row, col-1-i) 430 score_rint = rint(score) 431 if score_rint == best_score_rint: 432 best_score, best_score_rint = score, score_rint 433 best_indexes.append((row-1, i)) 434 elif score_rint > best_score_rint: 435 best_score, best_score_rint = score, score_rint 436 best_indexes = [(row-1, i)] 437 438 # Try to find a better score by opening gaps in sequenceB. 439 for i in range(0, row-1): 440 score = score_matrix[i][col-1] + gap_B_fn(col, row-1-i) 441 score_rint = rint(score) 442 if score_rint == best_score_rint: 443 best_score, best_score_rint = score, score_rint 444 best_indexes.append((i, col-1)) 445 elif score_rint > best_score_rint: 446 best_score, best_score_rint = score, score_rint 447 best_indexes = [(i, col-1)] 448 449 score_matrix[row][col] = best_score + \ 450 match_fn(sequenceA[row], sequenceB[col]) 451 if not align_globally and score_matrix[row][col] < 0: 452 score_matrix[row][col] = 0 453 trace_matrix[row][col] = best_indexes 454 return score_matrix, trace_matrix
455 456
457 -def _make_score_matrix_fast( 458 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 459 penalize_extend_when_opening, penalize_end_gaps, 460 align_globally, score_only):
461 first_A_gap = calc_affine_penalty(1, open_A, extend_A, 462 penalize_extend_when_opening) 463 first_B_gap = calc_affine_penalty(1, open_B, extend_B, 464 penalize_extend_when_opening) 465 466 # Create the score and traceback matrices. These should be in the 467 # shape: 468 # sequenceA (down) x sequenceB (across) 469 lenA, lenB = len(sequenceA), len(sequenceB) 470 score_matrix, trace_matrix = [], [] 471 for i in range(lenA): 472 score_matrix.append([None] * lenB) 473 trace_matrix.append([[None]] * lenB) 474 475 # The top and left borders of the matrices are special cases 476 # because there are no previously aligned characters. To simplify 477 # the main loop, handle these separately. 478 for i in range(lenA): 479 # Align the first residue in sequenceB to the ith residue in 480 # sequence A. This is like opening up i gaps at the beginning 481 # of sequence B. 482 score = match_fn(sequenceA[i], sequenceB[0]) 483 if penalize_end_gaps[1]: 484 score += calc_affine_penalty( 485 i, open_B, extend_B, penalize_extend_when_opening) 486 score_matrix[i][0] = score 487 for i in range(1, lenB): 488 score = match_fn(sequenceA[0], sequenceB[i]) 489 if penalize_end_gaps[0]: 490 score += calc_affine_penalty( 491 i, open_A, extend_A, penalize_extend_when_opening) 492 score_matrix[0][i] = score 493 494 # In the generic algorithm, at each row and column in the score 495 # matrix, we had to scan all previous rows and columns to see 496 # whether opening a gap might yield a higher score. Here, since 497 # we know the penalties are affine, we can cache just the best 498 # score in the previous rows and columns. Instead of scanning 499 # through all the previous rows and cols, we can just look at the 500 # cache for the best one. Whenever the row or col increments, the 501 # best cached score just decreases by extending the gap longer. 502 503 # The best score and indexes for each row (goes down all columns). 504 # I don't need to store the last row because it's the end of the 505 # sequence. 506 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1) 507 # The best score and indexes for each column (goes across rows). 508 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1) 509 510 for i in range(lenA-1): 511 # Initialize each row to be the alignment of sequenceA[i] to 512 # sequenceB[0], plus opening a gap in sequenceA. 513 row_cache_score[i] = score_matrix[i][0] + first_A_gap 514 row_cache_index[i] = [(i, 0)] 515 for i in range(lenB-1): 516 col_cache_score[i] = score_matrix[0][i] + first_B_gap 517 col_cache_index[i] = [(0, i)] 518 519 # Fill in the score_matrix. 520 for row in range(1, lenA): 521 for col in range(1, lenB): 522 # Calculate the score that would occur by extending the 523 # alignment without gaps. 524 nogap_score = score_matrix[row-1][col-1] 525 526 # Check the score that would occur if there were a gap in 527 # sequence A. 528 if col > 1: 529 row_score = row_cache_score[row-1] 530 else: 531 row_score = nogap_score - 1 # Make sure it's not the best. 532 # Check the score that would occur if there were a gap in 533 # sequence B. 534 if row > 1: 535 col_score = col_cache_score[col-1] 536 else: 537 col_score = nogap_score - 1 538 539 best_score = max(nogap_score, row_score, col_score) 540 best_score_rint = rint(best_score) 541 best_index = [] 542 if best_score_rint == rint(nogap_score): 543 best_index.append((row-1, col-1)) 544 if best_score_rint == rint(row_score): 545 best_index.extend(row_cache_index[row-1]) 546 if best_score_rint == rint(col_score): 547 best_index.extend(col_cache_index[col-1]) 548 549 # Set the score and traceback matrices. 550 score = best_score + match_fn(sequenceA[row], sequenceB[col]) 551 if not align_globally and score < 0: 552 score_matrix[row][col] = 0 553 else: 554 score_matrix[row][col] = score 555 trace_matrix[row][col] = best_index 556 557 # Update the cached column scores. The best score for 558 # this can come from either extending the gap in the 559 # previous cached score, or opening a new gap from the 560 # most previously seen character. Compare the two scores 561 # and keep the best one. 562 open_score = score_matrix[row-1][col-1] + first_B_gap 563 extend_score = col_cache_score[col-1] + extend_B 564 open_score_rint, extend_score_rint = \ 565 rint(open_score), rint(extend_score) 566 if open_score_rint > extend_score_rint: 567 col_cache_score[col-1] = open_score 568 col_cache_index[col-1] = [(row-1, col-1)] 569 elif extend_score_rint > open_score_rint: 570 col_cache_score[col-1] = extend_score 571 else: 572 col_cache_score[col-1] = open_score 573 if (row-1, col-1) not in col_cache_index[col-1]: 574 col_cache_index[col-1] = col_cache_index[col-1] + \ 575 [(row-1, col-1)] 576 577 # Update the cached row scores. 578 open_score = score_matrix[row-1][col-1] + first_A_gap 579 extend_score = row_cache_score[row-1] + extend_A 580 open_score_rint, extend_score_rint = \ 581 rint(open_score), rint(extend_score) 582 if open_score_rint > extend_score_rint: 583 row_cache_score[row-1] = open_score 584 row_cache_index[row-1] = [(row-1, col-1)] 585 elif extend_score_rint > open_score_rint: 586 row_cache_score[row-1] = extend_score 587 else: 588 row_cache_score[row-1] = open_score 589 if (row-1, col-1) not in row_cache_index[row-1]: 590 row_cache_index[row-1] = row_cache_index[row-1] + \ 591 [(row-1, col-1)] 592 593 return score_matrix, trace_matrix
594 595
596 -def _recover_alignments(sequenceA, sequenceB, starts, 597 score_matrix, trace_matrix, align_globally, 598 gap_char, one_alignment_only):
599 # Recover the alignments by following the traceback matrix. This 600 # is a recursive procedure, but it's implemented here iteratively 601 # with a stack. 602 lenA, lenB = len(sequenceA), len(sequenceB) 603 tracebacks = [] # list of (seq1, seq2, score, begin, end) 604 in_process = [] # list of ([same as tracebacks], prev_pos, next_pos) 605 606 # sequenceA and sequenceB may be sequences, including strings, 607 # lists, or list-like objects. In order to preserve the type of 608 # the object, we need to use slices on the sequences instead of 609 # indexes. For example, sequenceA[row] may return a type that's 610 # not compatible with sequenceA, e.g. if sequenceA is a list and 611 # sequenceA[row] is a string. Thus, avoid using indexes and use 612 # slices, e.g. sequenceA[row:row+1]. Assume that client-defined 613 # sequence classes preserve these semantics. 614 615 # Initialize the in_process stack 616 for score, (row, col) in starts: 617 if align_globally: 618 begin, end = None, None 619 else: 620 begin, end = None, -max(lenA-row, lenB-col)+1 621 if not end: 622 end = None 623 # Initialize the in_process list with empty sequences of the 624 # same type as sequenceA. To do this, take empty slices of 625 # the sequences. 626 in_process.append( 627 (sequenceA[0:0], sequenceB[0:0], score, begin, end, 628 (lenA, lenB), (row, col))) 629 if one_alignment_only: 630 break 631 while in_process and len(tracebacks) < MAX_ALIGNMENTS: 632 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop() 633 prevA, prevB = prev_pos 634 if next_pos is None: 635 prevlen = len(seqA) 636 # add the rest of the sequences 637 seqA = sequenceA[:prevA] + seqA 638 seqB = sequenceB[:prevB] + seqB 639 # add the rest of the gaps 640 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char) 641 642 # Now make sure begin is set. 643 if begin is None: 644 if align_globally: 645 begin = 0 646 else: 647 begin = len(seqA) - prevlen 648 tracebacks.append((seqA, seqB, score, begin, end)) 649 else: 650 nextA, nextB = next_pos 651 nseqA, nseqB = prevA-nextA, prevB-nextB 652 maxseq = max(nseqA, nseqB) 653 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB 654 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA 655 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB 656 prev_pos = next_pos 657 # local alignment stops early if score falls < 0 658 if not align_globally and score_matrix[nextA][nextB] <= 0: 659 begin = max(prevA, prevB) 660 in_process.append( 661 (seqA, seqB, score, begin, end, prev_pos, None)) 662 else: 663 for next_pos in trace_matrix[nextA][nextB]: 664 in_process.append( 665 (seqA, seqB, score, begin, end, prev_pos, next_pos)) 666 if one_alignment_only: 667 break 668 669 return _clean_alignments(tracebacks)
670 671
672 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn, 673 penalize_end_gaps, align_globally):
674 # Return a list of (score, (row, col)) indicating every possible 675 # place to start the tracebacks. 676 if align_globally: 677 starts = _find_global_start( 678 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps) 679 else: 680 starts = _find_local_start(score_matrix) 681 return starts
682 683
684 -def _find_global_start(sequenceA, sequenceB, 685 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
686 # The whole sequence should be aligned, so return the positions at 687 # the end of either one of the sequences. 688 nrows, ncols = len(score_matrix), len(score_matrix[0]) 689 positions = [] 690 # Search all rows in the last column. 691 for row in range(nrows): 692 # Find the score, penalizing end gaps if necessary. 693 score = score_matrix[row][ncols-1] 694 if penalize_end_gaps[1]: 695 score += gap_B_fn(ncols, nrows-row-1) 696 positions.append((score, (row, ncols-1))) 697 # Search all columns in the last row. 698 for col in range(ncols-1): 699 score = score_matrix[nrows-1][col] 700 if penalize_end_gaps[0]: 701 score += gap_A_fn(nrows, ncols-col-1) 702 positions.append((score, (nrows-1, col))) 703 return positions
704 705
706 -def _find_local_start(score_matrix):
707 # Return every position in the matrix. 708 positions = [] 709 nrows, ncols = len(score_matrix), len(score_matrix[0]) 710 for row in range(nrows): 711 for col in range(ncols): 712 score = score_matrix[row][col] 713 positions.append((score, (row, col))) 714 return positions
715 716
717 -def _clean_alignments(alignments):
718 # Take a list of alignments and return a cleaned version. Remove 719 # duplicates, make sure begin and end are set correctly, remove 720 # empty alignments. 721 unique_alignments = [] 722 for align in alignments: 723 if align not in unique_alignments: 724 unique_alignments.append(align) 725 i = 0 726 while i < len(unique_alignments): 727 seqA, seqB, score, begin, end = unique_alignments[i] 728 # Make sure end is set reasonably. 729 if end is None: # global alignment 730 end = len(seqA) 731 elif end < 0: 732 end = end + len(seqA) 733 # If there's no alignment here, get rid of it. 734 if begin >= end: 735 del unique_alignments[i] 736 continue 737 unique_alignments[i] = seqA, seqB, score, begin, end 738 i += 1 739 return unique_alignments
740 741
742 -def _pad_until_equal(s1, s2, char):
743 # Add char to the end of s1 or s2 until they are equal length. 744 ls1, ls2 = len(s1), len(s2) 745 if ls1 < ls2: 746 s1 = _pad(s1, char, ls2-ls1) 747 elif ls2 < ls1: 748 s2 = _pad(s2, char, ls1-ls2) 749 return s1, s2
750 751
752 -def _lpad_until_equal(s1, s2, char):
753 # Add char to the beginning of s1 or s2 until they are equal 754 # length. 755 ls1, ls2 = len(s1), len(s2) 756 if ls1 < ls2: 757 s1 = _lpad(s1, char, ls2-ls1) 758 elif ls2 < ls1: 759 s2 = _lpad(s2, char, ls1-ls2) 760 return s1, s2
761 762
763 -def _pad(s, char, n):
764 # Append n chars to the end of s. 765 return s + char*n
766 767
768 -def _lpad(s, char, n):
769 # Prepend n chars to the beginning of s. 770 return char*n + s
771 772 _PRECISION = 1000 773 774
775 -def rint(x, precision=_PRECISION):
776 return int(x * precision + 0.5)
777 778
779 -class identity_match:
780 """identity_match([match][, mismatch]) -> match_fn 781 782 Create a match function for use in an alignment. match and 783 mismatch are the scores to give when two residues are equal or 784 unequal. By default, match is 1 and mismatch is 0. 785 786 """
787 - def __init__(self, match=1, mismatch=0):
788 self.match = match 789 self.mismatch = mismatch
790
791 - def __call__(self, charA, charB):
792 if charA == charB: 793 return self.match 794 return self.mismatch
795 796
797 -class dictionary_match:
798 """dictionary_match(score_dict[, symmetric]) -> match_fn 799 800 Create a match function for use in an alignment. score_dict is a 801 dictionary where the keys are tuples (residue 1, residue 2) and 802 the values are the match scores between those residues. symmetric 803 is a flag that indicates whether the scores are symmetric. If 804 true, then if (res 1, res 2) doesn't exist, I will use the score 805 at (res 2, res 1). 806 807 """
808 - def __init__(self, score_dict, symmetric=1):
809 self.score_dict = score_dict 810 self.symmetric = symmetric
811
812 - def __call__(self, charA, charB):
813 if self.symmetric and (charA, charB) not in self.score_dict: 814 # If the score dictionary is symmetric, then look up the 815 # score both ways. 816 charB, charA = charA, charB 817 return self.score_dict[(charA, charB)]
818 819
820 -class affine_penalty:
821 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn 822 823 Create a gap function for use in an alignment. 824 825 """
826 - def __init__(self, open, extend, penalize_extend_when_opening=0):
827 if open > 0 or extend > 0: 828 raise ValueError("Gap penalties should be non-positive.") 829 self.open, self.extend = open, extend 830 self.penalize_extend_when_opening = penalize_extend_when_opening
831
832 - def __call__(self, index, length):
833 return calc_affine_penalty( 834 length, self.open, self.extend, self.penalize_extend_when_opening)
835 836
837 -def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
838 if length <= 0: 839 return 0 840 penalty = open + extend * length 841 if not penalize_extend_when_opening: 842 penalty -= extend 843 return penalty
844 845 862 863
864 -def format_alignment(align1, align2, score, begin, end):
865 """format_alignment(align1, align2, score, begin, end) -> string 866 867 Format the alignment prettily into a string. 868 869 """ 870 s = [] 871 s.append("%s\n" % align1) 872 s.append("%s%s\n" % (" "*begin, "|"*(end-begin))) 873 s.append("%s\n" % align2) 874 s.append(" Score=%g\n" % score) 875 return ''.join(s)
876 877 878 # Try and load C implementations of functions. If I can't, 879 # then just ignore and use the pure python implementations. 880 try: 881 from cpairwise2 import rint, _make_score_matrix_fast 882 except ImportError: 883 pass 884 885
886 -def _test():
887 """Run the module's doctests (PRIVATE).""" 888 print("Running doctests...") 889 import doctest 890 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 891 print("Done")
892 893 if __name__ == "__main__": 894 _test() 895