Package Bio :: Package Align
[hide private]
[frames] | no frames]

Source Code for Package Bio.Align

  1  # Copyright 2008-2011 by Peter Cock. 
  2  # All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Code for dealing with sequence alignments. 
  7   
  8  One of the most important things in this module is the MultipleSeqAlignment 
  9  class, used in the Bio.AlignIO module. 
 10   
 11  """ 
 12  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio import Alphabet 
 17   
 18  #We only import this and subclass it for some limited backward compatibility. 
 19  from Bio.Align.Generic import Alignment as _Alignment 
 20   
 21   
22 -class MultipleSeqAlignment(_Alignment):
23 """Represents a classical multiple sequence alignment (MSA). 24 25 By this we mean a collection of sequences (usually shown as rows) which 26 are all the same length (usually with gap characters for insertions or 27 padding). The data can then be regarded as a matrix of letters, with well 28 defined columns. 29 30 You would typically create an MSA by loading an alignment file with the 31 AlignIO module: 32 33 >>> from Bio import AlignIO 34 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 35 >>> print align 36 SingleLetterAlphabet() alignment with 7 rows and 156 columns 37 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 38 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 39 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 40 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 41 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 42 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 43 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 44 45 In some respects you can treat these objects as lists of SeqRecord objects, 46 each representing a row of the alignment. Iterating over an alignment gives 47 the SeqRecord object for each row: 48 49 >>> len(align) 50 7 51 >>> for record in align: 52 ... print record.id, len(record) 53 gi|6273285|gb|AF191659.1|AF191 156 54 gi|6273284|gb|AF191658.1|AF191 156 55 gi|6273287|gb|AF191661.1|AF191 156 56 gi|6273286|gb|AF191660.1|AF191 156 57 gi|6273290|gb|AF191664.1|AF191 156 58 gi|6273289|gb|AF191663.1|AF191 156 59 gi|6273291|gb|AF191665.1|AF191 156 60 61 You can also access individual rows as SeqRecord objects via their index: 62 63 >>> print align[0].id 64 gi|6273285|gb|AF191659.1|AF191 65 >>> print align[-1].id 66 gi|6273291|gb|AF191665.1|AF191 67 68 And extract columns as strings: 69 70 >>> print align[:,1] 71 AAAAAAA 72 73 Or, take just the first ten columns as a sub-alignment: 74 75 >>> print align[:,:10] 76 SingleLetterAlphabet() alignment with 7 rows and 10 columns 77 TATACATTAA gi|6273285|gb|AF191659.1|AF191 78 TATACATTAA gi|6273284|gb|AF191658.1|AF191 79 TATACATTAA gi|6273287|gb|AF191661.1|AF191 80 TATACATAAA gi|6273286|gb|AF191660.1|AF191 81 TATACATTAA gi|6273290|gb|AF191664.1|AF191 82 TATACATTAA gi|6273289|gb|AF191663.1|AF191 83 TATACATTAA gi|6273291|gb|AF191665.1|AF191 84 85 Combining this alignment slicing with alignment addition allows you to 86 remove a section of the alignment. For example, taking just the first 87 and last ten columns: 88 89 >>> print align[:,:10] + align[:,-10:] 90 SingleLetterAlphabet() alignment with 7 rows and 20 columns 91 TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191 92 TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191 93 TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191 94 TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191 95 TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191 96 TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191 97 TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191 98 99 Note - This object is intended to replace the existing Alignment object 100 defined in module Bio.Align.Generic but is not fully backwards compatible 101 with it. 102 103 Note - This object does NOT attempt to model the kind of alignments used 104 in next generation sequencing with multiple sequencing reads which are 105 much shorter than the alignment, and where there is usually a consensus or 106 reference sequence with special status. 107 """ 108
109 - def __init__(self, records, alphabet=None, 110 annotations=None):
111 """Initialize a new MultipleSeqAlignment object. 112 113 Arguments: 114 - records - A list (or iterator) of SeqRecord objects, whose 115 sequences are all the same length. This may be an be an 116 empty list. 117 - alphabet - The alphabet for the whole alignment, typically a gapped 118 alphabet, which should be a super-set of the individual 119 record alphabets. If omitted, a consensus alphabet is 120 used. 121 - annotations - Information about the whole alignment (dictionary). 122 123 You would normally load a MSA from a file using Bio.AlignIO, but you 124 can do this from a list of SeqRecord objects too: 125 126 >>> from Bio.Alphabet import generic_dna 127 >>> from Bio.Seq import Seq 128 >>> from Bio.SeqRecord import SeqRecord 129 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 130 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 131 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 132 >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"}) 133 >>> print align 134 DNAAlphabet() alignment with 3 rows and 7 columns 135 AAAACGT Alpha 136 AAA-CGT Beta 137 AAAAGGT Gamma 138 >>> align.annotations 139 {'tool': 'demo'} 140 141 NOTE - The older Bio.Align.Generic.Alignment class only accepted a 142 single argument, an alphabet. This is still supported via a backwards 143 compatible "hack" so as not to disrupt existing scripts and users, but 144 is deprecated and will be removed in a future release. 145 """ 146 if isinstance(records, Alphabet.Alphabet) \ 147 or isinstance(records, Alphabet.AlphabetEncoder): 148 if alphabet is None: 149 #TODO - Remove this backwards compatible mode! 150 alphabet = records 151 records = [] 152 import warnings 153 from Bio import BiopythonDeprecationWarning 154 warnings.warn("Invalid records argument: While the old " 155 "Bio.Align.Generic.Alignment class only " 156 "accepted a single argument (the alphabet), the " 157 "newer Bio.Align.MultipleSeqAlignment class " 158 "expects a list/iterator of SeqRecord objects " 159 "(which can be an empty list) and an optional " 160 "alphabet argument", BiopythonDeprecationWarning) 161 else : 162 raise ValueError("Invalid records argument") 163 if alphabet is not None : 164 if not (isinstance(alphabet, Alphabet.Alphabet) 165 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 166 raise ValueError("Invalid alphabet argument") 167 self._alphabet = alphabet 168 else : 169 #Default while we add sequences, will take a consensus later 170 self._alphabet = Alphabet.single_letter_alphabet 171 172 self._records = [] 173 if records: 174 self.extend(records) 175 if alphabet is None: 176 #No alphabet was given, take a consensus alphabet 177 self._alphabet = Alphabet._consensus_alphabet(rec.seq.alphabet for 178 rec in self._records 179 if rec.seq is not None) 180 181 # Annotations about the whole alignment 182 if annotations is None: 183 annotations = {} 184 elif not isinstance(annotations, dict): 185 raise TypeError("annotations argument should be a dict") 186 self.annotations = annotations
187
188 - def extend(self, records):
189 """Add more SeqRecord objects to the alignment as rows. 190 191 They must all have the same length as the original alignment, and have 192 alphabets compatible with the alignment's alphabet. For example, 193 194 >>> from Bio.Alphabet import generic_dna 195 >>> from Bio.Seq import Seq 196 >>> from Bio.SeqRecord import SeqRecord 197 >>> from Bio.Align import MultipleSeqAlignment 198 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 199 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 200 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 201 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 202 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 203 204 First we create a small alignment (three rows): 205 206 >>> align = MultipleSeqAlignment([a, b, c]) 207 >>> print align 208 DNAAlphabet() alignment with 3 rows and 7 columns 209 AAAACGT Alpha 210 AAA-CGT Beta 211 AAAAGGT Gamma 212 213 Now we can extend this alignment with another two rows: 214 215 >>> align.extend([d, e]) 216 >>> print align 217 DNAAlphabet() alignment with 5 rows and 7 columns 218 AAAACGT Alpha 219 AAA-CGT Beta 220 AAAAGGT Gamma 221 AAAACGT Delta 222 AAA-GGT Epsilon 223 224 Because the alignment object allows iteration over the rows as 225 SeqRecords, you can use the extend method with a second alignment 226 (provided its sequences have the same length as the original alignment). 227 """ 228 if len(self): 229 #Use the standard method to get the length 230 expected_length = self.get_alignment_length() 231 else: 232 #Take the first record's length 233 records = iter(records) # records arg could be list or iterator 234 try: 235 rec = records.next() 236 except StopIteration: 237 #Special case, no records 238 return 239 expected_length = len(rec) 240 self._append(rec, expected_length) 241 #Now continue to the rest of the records as usual 242 243 for rec in records: 244 self._append(rec, expected_length)
245
246 - def append(self, record):
247 """Add one more SeqRecord object to the alignment as a new row. 248 249 This must have the same length as the original alignment (unless this is 250 the first record), and have an alphabet compatible with the alignment's 251 alphabet. 252 253 >>> from Bio import AlignIO 254 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 255 >>> print align 256 SingleLetterAlphabet() alignment with 7 rows and 156 columns 257 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 258 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 259 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 260 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 261 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 262 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 263 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 264 >>> len(align) 265 7 266 267 We'll now construct a dummy record to append as an example: 268 269 >>> from Bio.Seq import Seq 270 >>> from Bio.SeqRecord import SeqRecord 271 >>> dummy = SeqRecord(Seq("N"*156), id="dummy") 272 273 Now append this to the alignment, 274 275 >>> align.append(dummy) 276 >>> print align 277 SingleLetterAlphabet() alignment with 8 rows and 156 columns 278 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 279 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 280 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 281 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 282 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 283 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 284 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 285 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy 286 >>> len(align) 287 8 288 289 """ 290 if self._records: 291 self._append(record, self.get_alignment_length()) 292 else: 293 self._append(record)
294
295 - def _append(self, record, expected_length=None):
296 """Helper function (PRIVATE).""" 297 if not isinstance(record, SeqRecord): 298 raise TypeError("New sequence is not a SeqRecord object") 299 300 #Currently the get_alignment_length() call is expensive, so we need 301 #to avoid calling it repeatedly for __init__ and extend, hence this 302 #private _append method 303 if expected_length is not None and len(record) != expected_length: 304 #TODO - Use the following more helpful error, but update unit tests 305 #raise ValueError("New sequence is not of length %i" \ 306 # % self.get_alignment_length()) 307 raise ValueError("Sequences must all be the same length") 308 309 #Using not self.alphabet.contains(record.seq.alphabet) needs fixing 310 #for AlphabetEncoders (e.g. gapped versus ungapped). 311 if not Alphabet._check_type_compatible([self._alphabet, record.seq.alphabet]): 312 raise ValueError("New sequence's alphabet is incompatible") 313 self._records.append(record)
314
315 - def __add__(self, other):
316 """Combines to alignments with the same number of rows by adding them. 317 318 If you have two multiple sequence alignments (MSAs), there are two ways to think 319 about adding them - by row or by column. Using the extend method adds by row. 320 Using the addition operator adds by column. For example, 321 322 >>> from Bio.Alphabet import generic_dna 323 >>> from Bio.Seq import Seq 324 >>> from Bio.SeqRecord import SeqRecord 325 >>> from Bio.Align import MultipleSeqAlignment 326 >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha") 327 >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta") 328 >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma") 329 >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha") 330 >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta") 331 >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma") 332 >>> left = MultipleSeqAlignment([a1, b1, c1], 333 ... annotations={"tool": "demo", "name": "start"}) 334 >>> right = MultipleSeqAlignment([a2, b2, c2], 335 ... annotations={"tool": "demo", "name": "end"}) 336 337 Now, let's look at these two alignments: 338 339 >>> print left 340 DNAAlphabet() alignment with 3 rows and 5 columns 341 AAAAC Alpha 342 AAA-C Beta 343 AAAAG Gamma 344 >>> print right 345 DNAAlphabet() alignment with 3 rows and 2 columns 346 GT Alpha 347 GT Beta 348 GT Gamma 349 350 And add them: 351 352 >>> combined = left + right 353 >>> print combined 354 DNAAlphabet() alignment with 3 rows and 7 columns 355 AAAACGT Alpha 356 AAA-CGT Beta 357 AAAAGGT Gamma 358 359 For this to work, both alignments must have the same number of records (here 360 they both have 3 rows): 361 362 >>> len(left) 363 3 364 >>> len(right) 365 3 366 >>> len(combined) 367 3 368 369 The individual rows are SeqRecord objects, and these can be added together. Refer 370 to the SeqRecord documentation for details of how the annotation is handled. This 371 example is a special case in that both original alignments shared the same names, 372 meaning when the rows are added they also get the same name. 373 374 Any common annotations are preserved, but differing annotation is lost. This is 375 the same behaviour used in the SeqRecord annotations and is designed to prevent 376 accidental propagation of inappropriate values: 377 378 >>> combined.annotations 379 {'tool': 'demo'} 380 381 """ 382 if not isinstance(other, MultipleSeqAlignment): 383 raise NotImplementedError 384 if len(self) != len(other): 385 raise ValueError("When adding two alignments they must have the same length" 386 " (i.e. same number or rows)") 387 alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet]) 388 merged = (left+right for left,right in zip(self, other)) 389 # Take any common annotation: 390 annotations = dict() 391 for k, v in self.annotations.iteritems(): 392 if k in other.annotations and other.annotations[k] == v: 393 annotations[k] = v 394 return MultipleSeqAlignment(merged, alpha, annotations)
395
396 - def __getitem__(self, index):
397 """Access part of the alignment. 398 399 Depending on the indices, you can get a SeqRecord object 400 (representing a single row), a Seq object (for a single columns), 401 a string (for a single characters) or another alignment 402 (representing some part or all of the alignment). 403 404 align[r,c] gives a single character as a string 405 align[r] gives a row as a SeqRecord 406 align[r,:] gives a row as a SeqRecord 407 align[:,c] gives a column as a Seq (using the alignment's alphabet) 408 409 align[:] and align[:,:] give a copy of the alignment 410 411 Anything else gives a sub alignment, e.g. 412 align[0:2] or align[0:2,:] uses only row 0 and 1 413 align[:,1:3] uses only columns 1 and 2 414 align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2 415 416 We'll use the following example alignment here for illustration: 417 418 >>> from Bio.Alphabet import generic_dna 419 >>> from Bio.Seq import Seq 420 >>> from Bio.SeqRecord import SeqRecord 421 >>> from Bio.Align import MultipleSeqAlignment 422 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 423 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 424 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 425 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 426 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 427 >>> align = MultipleSeqAlignment([a, b, c, d, e], generic_dna) 428 429 You can access a row of the alignment as a SeqRecord using an integer 430 index (think of the alignment as a list of SeqRecord objects here): 431 432 >>> first_record = align[0] 433 >>> print first_record.id, first_record.seq 434 Alpha AAAACGT 435 >>> last_record = align[-1] 436 >>> print last_record.id, last_record.seq 437 Epsilon AAA-GGT 438 439 You can also access use python's slice notation to create a sub-alignment 440 containing only some of the SeqRecord objects: 441 442 >>> sub_alignment = align[2:5] 443 >>> print sub_alignment 444 DNAAlphabet() alignment with 3 rows and 7 columns 445 AAAAGGT Gamma 446 AAAACGT Delta 447 AAA-GGT Epsilon 448 449 This includes support for a step, i.e. align[start:end:step], which 450 can be used to select every second sequence: 451 452 >>> sub_alignment = align[::2] 453 >>> print sub_alignment 454 DNAAlphabet() alignment with 3 rows and 7 columns 455 AAAACGT Alpha 456 AAAAGGT Gamma 457 AAA-GGT Epsilon 458 459 Or to get a copy of the alignment with the rows in reverse order: 460 461 >>> rev_alignment = align[::-1] 462 >>> print rev_alignment 463 DNAAlphabet() alignment with 5 rows and 7 columns 464 AAA-GGT Epsilon 465 AAAACGT Delta 466 AAAAGGT Gamma 467 AAA-CGT Beta 468 AAAACGT Alpha 469 470 You can also use two indices to specify both rows and columns. Using simple 471 integers gives you the entry as a single character string. e.g. 472 473 >>> align[3,4] 474 'C' 475 476 This is equivalent to: 477 478 >>> align[3][4] 479 'C' 480 481 or: 482 483 >>> align[3].seq[4] 484 'C' 485 486 To get a single column (as a string) use this syntax: 487 488 >>> align[:,4] 489 'CCGCG' 490 491 Or, to get part of a column, 492 493 >>> align[1:3,4] 494 'CG' 495 496 However, in general you get a sub-alignment, 497 498 >>> print align[1:5,3:6] 499 DNAAlphabet() alignment with 4 rows and 3 columns 500 -CG Beta 501 AGG Gamma 502 ACG Delta 503 -GG Epsilon 504 505 This should all seem familiar to anyone who has used the NumPy 506 array or matrix objects. 507 """ 508 if isinstance(index, int): 509 #e.g. result = align[x] 510 #Return a SeqRecord 511 return self._records[index] 512 elif isinstance(index, slice): 513 #e.g. sub_align = align[i:j:k] 514 return MultipleSeqAlignment(self._records[index], self._alphabet) 515 elif len(index)!=2: 516 raise TypeError("Invalid index type.") 517 518 #Handle double indexing 519 row_index, col_index = index 520 if isinstance(row_index, int): 521 #e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord 522 return self._records[row_index][col_index] 523 elif isinstance(col_index, int): 524 #e.g. col_or_part_col = align[1:5, 6], gives a string 525 return "".join(rec[col_index] for rec in self._records[row_index]) 526 else: 527 #e.g. sub_align = align[1:4, 5:7], gives another alignment 528 return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]), 529 self._alphabet)
530
531 - def sort(self, key=None, reverse=False):
532 """Sort the rows (SeqRecord objects) of the alignment in place. 533 534 This sorts the rows alphabetically using the SeqRecord object id by 535 default. The sorting can be controlled by supplying a key function 536 which must map each SeqRecord to a sort value. 537 538 This is useful if you want to add two alignments which use the same 539 record identifiers, but in a different order. For example, 540 541 >>> from Bio.Alphabet import generic_dna 542 >>> from Bio.Seq import Seq 543 >>> from Bio.SeqRecord import SeqRecord 544 >>> from Bio.Align import MultipleSeqAlignment 545 >>> align1 = MultipleSeqAlignment([ 546 ... SeqRecord(Seq("ACGT", generic_dna), id="Human"), 547 ... SeqRecord(Seq("ACGG", generic_dna), id="Mouse"), 548 ... SeqRecord(Seq("ACGC", generic_dna), id="Chicken"), 549 ... ]) 550 >>> align2 = MultipleSeqAlignment([ 551 ... SeqRecord(Seq("CGGT", generic_dna), id="Mouse"), 552 ... SeqRecord(Seq("CGTT", generic_dna), id="Human"), 553 ... SeqRecord(Seq("CGCT", generic_dna), id="Chicken"), 554 ... ]) 555 556 If you simple try and add these without sorting, you get this: 557 558 >>> print align1 + align2 559 DNAAlphabet() alignment with 3 rows and 8 columns 560 ACGTCGGT <unknown id> 561 ACGGCGTT <unknown id> 562 ACGCCGCT Chicken 563 564 Consult the SeqRecord documentation which explains why you get a 565 default value when annotation like the identifier doesn't match up. 566 However, if we sort the alignments first, then add them we get the 567 desired result: 568 569 >>> align1.sort() 570 >>> align2.sort() 571 >>> print align1 + align2 572 DNAAlphabet() alignment with 3 rows and 8 columns 573 ACGCCGCT Chicken 574 ACGTCGTT Human 575 ACGGCGGT Mouse 576 577 As an example using a different sort order, you could sort on the 578 GC content of each sequence. 579 580 >>> from Bio.SeqUtils import GC 581 >>> print align1 582 DNAAlphabet() alignment with 3 rows and 4 columns 583 ACGC Chicken 584 ACGT Human 585 ACGG Mouse 586 >>> align1.sort(key = lambda record: GC(record.seq)) 587 >>> print align1 588 DNAAlphabet() alignment with 3 rows and 4 columns 589 ACGT Human 590 ACGC Chicken 591 ACGG Mouse 592 593 There is also a reverse argument, so if you wanted to sort by ID 594 but backwards: 595 596 >>> align1.sort(reverse=True) 597 >>> print align1 598 DNAAlphabet() alignment with 3 rows and 4 columns 599 ACGG Mouse 600 ACGT Human 601 ACGC Chicken 602 603 """ 604 if key is None: 605 self._records.sort(key = lambda r: r.id, reverse = reverse) 606 else: 607 self._records.sort(key = key, reverse = reverse)
608
609 - def get_column(self, col):
610 """Returns a string containing a given column (DEPRECATED). 611 612 This is a method provided for backwards compatibility with the old 613 Bio.Align.Generic.Alignment object. Please use the slice notation 614 instead, since get_column is likely to be removed in a future release 615 of Biopython.. 616 """ 617 import warnings 618 import Bio 619 warnings.warn("This method is deprecated and is provided for backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the slice notation instead, as get_column is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 620 return _Alignment.get_column(self, col)
621
622 - def add_sequence(self, descriptor, sequence, start = None, end = None, 623 weight = 1.0):
624 """Add a sequence to the alignment (DEPRECATED). 625 626 The start, end, and weight arguments are not supported! This method 627 only provides limited backwards compatibility with the old 628 Bio.Align.Generic.Alignment object. Please use the append method with 629 a SeqRecord instead, since add_sequence is likely to be removed in a 630 future release of Biopython. 631 """ 632 import warnings 633 import Bio 634 warnings.warn("The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 635 #Should we handle start/end/strand information somehow? What for? 636 #TODO - Should we handle weights somehow? See also AlignInfo code... 637 if start is not None or end is not None or weight != 1.0: 638 raise ValueError("The add_Sequence method is obsolete, and only " 639 "provides limited backwards compatibily. The" 640 "start, end and weight arguments are not " 641 "supported.") 642 self.append(SeqRecord(Seq(sequence, self._alphabet), 643 id = descriptor, description = descriptor))
644 645 646 if __name__ == "__main__": 647 from Bio._utils import run_doctest 648 run_doctest() 649