Package Bio :: Package Align
[hide private]
[frames] | no frames]

Source Code for Package Bio.Align

  1  # Copyright 2008-2011 by Peter Cock. 
  2  # All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Code for dealing with sequence alignments. 
  7   
  8  One of the most important things in this module is the MultipleSeqAlignment 
  9  class, used in the Bio.AlignIO module. 
 10   
 11  """ 
 12  from __future__ import print_function 
 13   
 14  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 15   
 16  from Bio.Seq import Seq 
 17  from Bio.SeqRecord import SeqRecord 
 18  from Bio import Alphabet 
 19   
 20  #We only import this and subclass it for some limited backward compatibility. 
 21  from Bio.Align.Generic import Alignment as _Alignment 
 22   
 23   
24 -class MultipleSeqAlignment(_Alignment):
25 """Represents a classical multiple sequence alignment (MSA). 26 27 By this we mean a collection of sequences (usually shown as rows) which 28 are all the same length (usually with gap characters for insertions or 29 padding). The data can then be regarded as a matrix of letters, with well 30 defined columns. 31 32 You would typically create an MSA by loading an alignment file with the 33 AlignIO module: 34 35 >>> from Bio import AlignIO 36 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 37 >>> print(align) 38 SingleLetterAlphabet() alignment with 7 rows and 156 columns 39 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 40 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 41 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 42 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 43 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 44 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 45 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 46 47 In some respects you can treat these objects as lists of SeqRecord objects, 48 each representing a row of the alignment. Iterating over an alignment gives 49 the SeqRecord object for each row: 50 51 >>> len(align) 52 7 53 >>> for record in align: 54 ... print("%s %i" % (record.id, len(record))) 55 gi|6273285|gb|AF191659.1|AF191 156 56 gi|6273284|gb|AF191658.1|AF191 156 57 gi|6273287|gb|AF191661.1|AF191 156 58 gi|6273286|gb|AF191660.1|AF191 156 59 gi|6273290|gb|AF191664.1|AF191 156 60 gi|6273289|gb|AF191663.1|AF191 156 61 gi|6273291|gb|AF191665.1|AF191 156 62 63 You can also access individual rows as SeqRecord objects via their index: 64 65 >>> print(align[0].id) 66 gi|6273285|gb|AF191659.1|AF191 67 >>> print(align[-1].id) 68 gi|6273291|gb|AF191665.1|AF191 69 70 And extract columns as strings: 71 72 >>> print(align[:, 1]) 73 AAAAAAA 74 75 Or, take just the first ten columns as a sub-alignment: 76 77 >>> print(align[:, :10]) 78 SingleLetterAlphabet() alignment with 7 rows and 10 columns 79 TATACATTAA gi|6273285|gb|AF191659.1|AF191 80 TATACATTAA gi|6273284|gb|AF191658.1|AF191 81 TATACATTAA gi|6273287|gb|AF191661.1|AF191 82 TATACATAAA gi|6273286|gb|AF191660.1|AF191 83 TATACATTAA gi|6273290|gb|AF191664.1|AF191 84 TATACATTAA gi|6273289|gb|AF191663.1|AF191 85 TATACATTAA gi|6273291|gb|AF191665.1|AF191 86 87 Combining this alignment slicing with alignment addition allows you to 88 remove a section of the alignment. For example, taking just the first 89 and last ten columns: 90 91 >>> print(align[:, :10] + align[:, -10:]) 92 SingleLetterAlphabet() alignment with 7 rows and 20 columns 93 TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191 94 TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191 95 TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191 96 TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191 97 TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191 98 TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191 99 TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191 100 101 Note - This object is intended to replace the existing Alignment object 102 defined in module Bio.Align.Generic but is not fully backwards compatible 103 with it. 104 105 Note - This object does NOT attempt to model the kind of alignments used 106 in next generation sequencing with multiple sequencing reads which are 107 much shorter than the alignment, and where there is usually a consensus or 108 reference sequence with special status. 109 """ 110
111 - def __init__(self, records, alphabet=None, 112 annotations=None):
113 """Initialize a new MultipleSeqAlignment object. 114 115 Arguments: 116 - records - A list (or iterator) of SeqRecord objects, whose 117 sequences are all the same length. This may be an be an 118 empty list. 119 - alphabet - The alphabet for the whole alignment, typically a gapped 120 alphabet, which should be a super-set of the individual 121 record alphabets. If omitted, a consensus alphabet is 122 used. 123 - annotations - Information about the whole alignment (dictionary). 124 125 You would normally load a MSA from a file using Bio.AlignIO, but you 126 can do this from a list of SeqRecord objects too: 127 128 >>> from Bio.Alphabet import generic_dna 129 >>> from Bio.Seq import Seq 130 >>> from Bio.SeqRecord import SeqRecord 131 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 132 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 133 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 134 >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"}) 135 >>> print(align) 136 DNAAlphabet() alignment with 3 rows and 7 columns 137 AAAACGT Alpha 138 AAA-CGT Beta 139 AAAAGGT Gamma 140 >>> align.annotations 141 {'tool': 'demo'} 142 143 NOTE - The older Bio.Align.Generic.Alignment class only accepted a 144 single argument, an alphabet. This is still supported via a backwards 145 compatible "hack" so as not to disrupt existing scripts and users, but 146 is deprecated and will be removed in a future release. 147 """ 148 if isinstance(records, Alphabet.Alphabet) \ 149 or isinstance(records, Alphabet.AlphabetEncoder): 150 if alphabet is None: 151 #TODO - Remove this backwards compatible mode! 152 alphabet = records 153 records = [] 154 import warnings 155 from Bio import BiopythonDeprecationWarning 156 warnings.warn("Invalid records argument: While the old " 157 "Bio.Align.Generic.Alignment class only " 158 "accepted a single argument (the alphabet), the " 159 "newer Bio.Align.MultipleSeqAlignment class " 160 "expects a list/iterator of SeqRecord objects " 161 "(which can be an empty list) and an optional " 162 "alphabet argument", BiopythonDeprecationWarning) 163 else : 164 raise ValueError("Invalid records argument") 165 if alphabet is not None : 166 if not (isinstance(alphabet, Alphabet.Alphabet) 167 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 168 raise ValueError("Invalid alphabet argument") 169 self._alphabet = alphabet 170 else : 171 #Default while we add sequences, will take a consensus later 172 self._alphabet = Alphabet.single_letter_alphabet 173 174 self._records = [] 175 if records: 176 self.extend(records) 177 if alphabet is None: 178 #No alphabet was given, take a consensus alphabet 179 self._alphabet = Alphabet._consensus_alphabet(rec.seq.alphabet for 180 rec in self._records 181 if rec.seq is not None) 182 183 # Annotations about the whole alignment 184 if annotations is None: 185 annotations = {} 186 elif not isinstance(annotations, dict): 187 raise TypeError("annotations argument should be a dict") 188 self.annotations = annotations
189
190 - def extend(self, records):
191 """Add more SeqRecord objects to the alignment as rows. 192 193 They must all have the same length as the original alignment, and have 194 alphabets compatible with the alignment's alphabet. For example, 195 196 >>> from Bio.Alphabet import generic_dna 197 >>> from Bio.Seq import Seq 198 >>> from Bio.SeqRecord import SeqRecord 199 >>> from Bio.Align import MultipleSeqAlignment 200 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 201 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 202 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 203 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 204 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 205 206 First we create a small alignment (three rows): 207 208 >>> align = MultipleSeqAlignment([a, b, c]) 209 >>> print(align) 210 DNAAlphabet() alignment with 3 rows and 7 columns 211 AAAACGT Alpha 212 AAA-CGT Beta 213 AAAAGGT Gamma 214 215 Now we can extend this alignment with another two rows: 216 217 >>> align.extend([d, e]) 218 >>> print(align) 219 DNAAlphabet() alignment with 5 rows and 7 columns 220 AAAACGT Alpha 221 AAA-CGT Beta 222 AAAAGGT Gamma 223 AAAACGT Delta 224 AAA-GGT Epsilon 225 226 Because the alignment object allows iteration over the rows as 227 SeqRecords, you can use the extend method with a second alignment 228 (provided its sequences have the same length as the original alignment). 229 """ 230 if len(self): 231 #Use the standard method to get the length 232 expected_length = self.get_alignment_length() 233 else: 234 #Take the first record's length 235 records = iter(records) # records arg could be list or iterator 236 try: 237 rec = next(records) 238 except StopIteration: 239 #Special case, no records 240 return 241 expected_length = len(rec) 242 self._append(rec, expected_length) 243 #Now continue to the rest of the records as usual 244 245 for rec in records: 246 self._append(rec, expected_length)
247
248 - def append(self, record):
249 """Add one more SeqRecord object to the alignment as a new row. 250 251 This must have the same length as the original alignment (unless this is 252 the first record), and have an alphabet compatible with the alignment's 253 alphabet. 254 255 >>> from Bio import AlignIO 256 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 257 >>> print(align) 258 SingleLetterAlphabet() alignment with 7 rows and 156 columns 259 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 260 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 261 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 262 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 263 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 264 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 265 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 266 >>> len(align) 267 7 268 269 We'll now construct a dummy record to append as an example: 270 271 >>> from Bio.Seq import Seq 272 >>> from Bio.SeqRecord import SeqRecord 273 >>> dummy = SeqRecord(Seq("N"*156), id="dummy") 274 275 Now append this to the alignment, 276 277 >>> align.append(dummy) 278 >>> print(align) 279 SingleLetterAlphabet() alignment with 8 rows and 156 columns 280 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 281 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 282 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 283 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 284 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 285 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 286 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 287 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy 288 >>> len(align) 289 8 290 291 """ 292 if self._records: 293 self._append(record, self.get_alignment_length()) 294 else: 295 self._append(record)
296
297 - def _append(self, record, expected_length=None):
298 """Helper function (PRIVATE).""" 299 if not isinstance(record, SeqRecord): 300 raise TypeError("New sequence is not a SeqRecord object") 301 302 #Currently the get_alignment_length() call is expensive, so we need 303 #to avoid calling it repeatedly for __init__ and extend, hence this 304 #private _append method 305 if expected_length is not None and len(record) != expected_length: 306 #TODO - Use the following more helpful error, but update unit tests 307 #raise ValueError("New sequence is not of length %i" \ 308 # % self.get_alignment_length()) 309 raise ValueError("Sequences must all be the same length") 310 311 #Using not self.alphabet.contains(record.seq.alphabet) needs fixing 312 #for AlphabetEncoders (e.g. gapped versus ungapped). 313 if not Alphabet._check_type_compatible([self._alphabet, record.seq.alphabet]): 314 raise ValueError("New sequence's alphabet is incompatible") 315 self._records.append(record)
316
317 - def __add__(self, other):
318 """Combines to alignments with the same number of rows by adding them. 319 320 If you have two multiple sequence alignments (MSAs), there are two ways to think 321 about adding them - by row or by column. Using the extend method adds by row. 322 Using the addition operator adds by column. For example, 323 324 >>> from Bio.Alphabet import generic_dna 325 >>> from Bio.Seq import Seq 326 >>> from Bio.SeqRecord import SeqRecord 327 >>> from Bio.Align import MultipleSeqAlignment 328 >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha") 329 >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta") 330 >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma") 331 >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha") 332 >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta") 333 >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma") 334 >>> left = MultipleSeqAlignment([a1, b1, c1], 335 ... annotations={"tool": "demo", "name": "start"}) 336 >>> right = MultipleSeqAlignment([a2, b2, c2], 337 ... annotations={"tool": "demo", "name": "end"}) 338 339 Now, let's look at these two alignments: 340 341 >>> print(left) 342 DNAAlphabet() alignment with 3 rows and 5 columns 343 AAAAC Alpha 344 AAA-C Beta 345 AAAAG Gamma 346 >>> print(right) 347 DNAAlphabet() alignment with 3 rows and 2 columns 348 GT Alpha 349 GT Beta 350 GT Gamma 351 352 And add them: 353 354 >>> combined = left + right 355 >>> print(combined) 356 DNAAlphabet() alignment with 3 rows and 7 columns 357 AAAACGT Alpha 358 AAA-CGT Beta 359 AAAAGGT Gamma 360 361 For this to work, both alignments must have the same number of records (here 362 they both have 3 rows): 363 364 >>> len(left) 365 3 366 >>> len(right) 367 3 368 >>> len(combined) 369 3 370 371 The individual rows are SeqRecord objects, and these can be added together. Refer 372 to the SeqRecord documentation for details of how the annotation is handled. This 373 example is a special case in that both original alignments shared the same names, 374 meaning when the rows are added they also get the same name. 375 376 Any common annotations are preserved, but differing annotation is lost. This is 377 the same behaviour used in the SeqRecord annotations and is designed to prevent 378 accidental propagation of inappropriate values: 379 380 >>> combined.annotations 381 {'tool': 'demo'} 382 383 """ 384 if not isinstance(other, MultipleSeqAlignment): 385 raise NotImplementedError 386 if len(self) != len(other): 387 raise ValueError("When adding two alignments they must have the same length" 388 " (i.e. same number or rows)") 389 alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet]) 390 merged = (left+right for left, right in zip(self, other)) 391 # Take any common annotation: 392 annotations = dict() 393 for k, v in self.annotations.items(): 394 if k in other.annotations and other.annotations[k] == v: 395 annotations[k] = v 396 return MultipleSeqAlignment(merged, alpha, annotations)
397
398 - def __getitem__(self, index):
399 """Access part of the alignment. 400 401 Depending on the indices, you can get a SeqRecord object 402 (representing a single row), a Seq object (for a single columns), 403 a string (for a single characters) or another alignment 404 (representing some part or all of the alignment). 405 406 align[r,c] gives a single character as a string 407 align[r] gives a row as a SeqRecord 408 align[r,:] gives a row as a SeqRecord 409 align[:,c] gives a column as a Seq (using the alignment's alphabet) 410 411 align[:] and align[:,:] give a copy of the alignment 412 413 Anything else gives a sub alignment, e.g. 414 align[0:2] or align[0:2,:] uses only row 0 and 1 415 align[:,1:3] uses only columns 1 and 2 416 align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2 417 418 We'll use the following example alignment here for illustration: 419 420 >>> from Bio.Alphabet import generic_dna 421 >>> from Bio.Seq import Seq 422 >>> from Bio.SeqRecord import SeqRecord 423 >>> from Bio.Align import MultipleSeqAlignment 424 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 425 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 426 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 427 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 428 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 429 >>> align = MultipleSeqAlignment([a, b, c, d, e], generic_dna) 430 431 You can access a row of the alignment as a SeqRecord using an integer 432 index (think of the alignment as a list of SeqRecord objects here): 433 434 >>> first_record = align[0] 435 >>> print("%s %s" % (first_record.id, first_record.seq)) 436 Alpha AAAACGT 437 >>> last_record = align[-1] 438 >>> print("%s %s" % (last_record.id, last_record.seq)) 439 Epsilon AAA-GGT 440 441 You can also access use python's slice notation to create a sub-alignment 442 containing only some of the SeqRecord objects: 443 444 >>> sub_alignment = align[2:5] 445 >>> print(sub_alignment) 446 DNAAlphabet() alignment with 3 rows and 7 columns 447 AAAAGGT Gamma 448 AAAACGT Delta 449 AAA-GGT Epsilon 450 451 This includes support for a step, i.e. align[start:end:step], which 452 can be used to select every second sequence: 453 454 >>> sub_alignment = align[::2] 455 >>> print(sub_alignment) 456 DNAAlphabet() alignment with 3 rows and 7 columns 457 AAAACGT Alpha 458 AAAAGGT Gamma 459 AAA-GGT Epsilon 460 461 Or to get a copy of the alignment with the rows in reverse order: 462 463 >>> rev_alignment = align[::-1] 464 >>> print(rev_alignment) 465 DNAAlphabet() alignment with 5 rows and 7 columns 466 AAA-GGT Epsilon 467 AAAACGT Delta 468 AAAAGGT Gamma 469 AAA-CGT Beta 470 AAAACGT Alpha 471 472 You can also use two indices to specify both rows and columns. Using simple 473 integers gives you the entry as a single character string. e.g. 474 475 >>> align[3, 4] 476 'C' 477 478 This is equivalent to: 479 480 >>> align[3][4] 481 'C' 482 483 or: 484 485 >>> align[3].seq[4] 486 'C' 487 488 To get a single column (as a string) use this syntax: 489 490 >>> align[:, 4] 491 'CCGCG' 492 493 Or, to get part of a column, 494 495 >>> align[1:3, 4] 496 'CG' 497 498 However, in general you get a sub-alignment, 499 500 >>> print(align[1:5, 3:6]) 501 DNAAlphabet() alignment with 4 rows and 3 columns 502 -CG Beta 503 AGG Gamma 504 ACG Delta 505 -GG Epsilon 506 507 This should all seem familiar to anyone who has used the NumPy 508 array or matrix objects. 509 """ 510 if isinstance(index, int): 511 #e.g. result = align[x] 512 #Return a SeqRecord 513 return self._records[index] 514 elif isinstance(index, slice): 515 #e.g. sub_align = align[i:j:k] 516 return MultipleSeqAlignment(self._records[index], self._alphabet) 517 elif len(index)!=2: 518 raise TypeError("Invalid index type.") 519 520 #Handle double indexing 521 row_index, col_index = index 522 if isinstance(row_index, int): 523 #e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord 524 return self._records[row_index][col_index] 525 elif isinstance(col_index, int): 526 #e.g. col_or_part_col = align[1:5, 6], gives a string 527 return "".join(rec[col_index] for rec in self._records[row_index]) 528 else: 529 #e.g. sub_align = align[1:4, 5:7], gives another alignment 530 return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]), 531 self._alphabet)
532
533 - def sort(self, key=None, reverse=False):
534 """Sort the rows (SeqRecord objects) of the alignment in place. 535 536 This sorts the rows alphabetically using the SeqRecord object id by 537 default. The sorting can be controlled by supplying a key function 538 which must map each SeqRecord to a sort value. 539 540 This is useful if you want to add two alignments which use the same 541 record identifiers, but in a different order. For example, 542 543 >>> from Bio.Alphabet import generic_dna 544 >>> from Bio.Seq import Seq 545 >>> from Bio.SeqRecord import SeqRecord 546 >>> from Bio.Align import MultipleSeqAlignment 547 >>> align1 = MultipleSeqAlignment([ 548 ... SeqRecord(Seq("ACGT", generic_dna), id="Human"), 549 ... SeqRecord(Seq("ACGG", generic_dna), id="Mouse"), 550 ... SeqRecord(Seq("ACGC", generic_dna), id="Chicken"), 551 ... ]) 552 >>> align2 = MultipleSeqAlignment([ 553 ... SeqRecord(Seq("CGGT", generic_dna), id="Mouse"), 554 ... SeqRecord(Seq("CGTT", generic_dna), id="Human"), 555 ... SeqRecord(Seq("CGCT", generic_dna), id="Chicken"), 556 ... ]) 557 558 If you simple try and add these without sorting, you get this: 559 560 >>> print(align1 + align2) 561 DNAAlphabet() alignment with 3 rows and 8 columns 562 ACGTCGGT <unknown id> 563 ACGGCGTT <unknown id> 564 ACGCCGCT Chicken 565 566 Consult the SeqRecord documentation which explains why you get a 567 default value when annotation like the identifier doesn't match up. 568 However, if we sort the alignments first, then add them we get the 569 desired result: 570 571 >>> align1.sort() 572 >>> align2.sort() 573 >>> print(align1 + align2) 574 DNAAlphabet() alignment with 3 rows and 8 columns 575 ACGCCGCT Chicken 576 ACGTCGTT Human 577 ACGGCGGT Mouse 578 579 As an example using a different sort order, you could sort on the 580 GC content of each sequence. 581 582 >>> from Bio.SeqUtils import GC 583 >>> print(align1) 584 DNAAlphabet() alignment with 3 rows and 4 columns 585 ACGC Chicken 586 ACGT Human 587 ACGG Mouse 588 >>> align1.sort(key = lambda record: GC(record.seq)) 589 >>> print(align1) 590 DNAAlphabet() alignment with 3 rows and 4 columns 591 ACGT Human 592 ACGC Chicken 593 ACGG Mouse 594 595 There is also a reverse argument, so if you wanted to sort by ID 596 but backwards: 597 598 >>> align1.sort(reverse=True) 599 >>> print(align1) 600 DNAAlphabet() alignment with 3 rows and 4 columns 601 ACGG Mouse 602 ACGT Human 603 ACGC Chicken 604 605 """ 606 if key is None: 607 self._records.sort(key = lambda r: r.id, reverse = reverse) 608 else: 609 self._records.sort(key = key, reverse = reverse)
610
611 - def get_column(self, col):
612 """Returns a string containing a given column (DEPRECATED). 613 614 This is a method provided for backwards compatibility with the old 615 Bio.Align.Generic.Alignment object. Please use the slice notation 616 instead, since get_column is likely to be removed in a future release 617 of Biopython.. 618 """ 619 import warnings 620 import Bio 621 warnings.warn("This method is deprecated and is provided for backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the slice notation instead, as get_column is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 622 return _Alignment.get_column(self, col)
623
624 - def add_sequence(self, descriptor, sequence, start = None, end = None, 625 weight = 1.0):
626 """Add a sequence to the alignment (DEPRECATED). 627 628 The start, end, and weight arguments are not supported! This method 629 only provides limited backwards compatibility with the old 630 Bio.Align.Generic.Alignment object. Please use the append method with 631 a SeqRecord instead, since add_sequence is likely to be removed in a 632 future release of Biopython. 633 """ 634 import warnings 635 import Bio 636 warnings.warn("The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 637 #Should we handle start/end/strand information somehow? What for? 638 #TODO - Should we handle weights somehow? See also AlignInfo code... 639 if start is not None or end is not None or weight != 1.0: 640 raise ValueError("The add_Sequence method is obsolete, and only " 641 "provides limited backwards compatibily. The" 642 "start, end and weight arguments are not " 643 "supported.") 644 self.append(SeqRecord(Seq(sequence, self._alphabet), 645 id = descriptor, description = descriptor))
646 647 648 if __name__ == "__main__": 649 from Bio._utils import run_doctest 650 run_doctest() 651