Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2010 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Classes for generic sequence alignment. 
  9   
 10  Contains classes to deal with generic sequence alignment stuff not 
 11  specific to a particular program or format. 
 12   
 13  Classes: 
 14   - Alignment 
 15  """ 
 16  from __future__ import print_function 
 17   
 18  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 19   
 20  # biopython 
 21  from Bio.Seq import Seq 
 22  from Bio.SeqRecord import SeqRecord 
 23  from Bio import Alphabet 
 24   
 25   
26 -class Alignment(object):
27 """Represent a set of alignments (DEPRECATED). 28 29 This is a base class to represent alignments, which can be subclassed 30 to deal with an alignment in a specific format. 31 32 With the introduction of the MultipleSeqAlignment class in Bio.Align, 33 this base class is deprecated and is likely to be removed in future 34 releases of Biopython. 35 """
36 - def __init__(self, alphabet):
37 """Initialize a new Alignment object. 38 39 Arguments: 40 - alphabet - The alphabet to use for the sequence objects that are 41 created. This alphabet must be a gapped type. 42 43 e.g. 44 45 >>> from Bio.Alphabet import IUPAC, Gapped 46 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 47 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 48 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 49 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 50 >>> print(align) 51 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 52 ACTGCTAGCTAG Alpha 53 ACT-CTAGCTAG Beta 54 ACTGCTAGATAG Gamma 55 """ 56 import warnings 57 import Bio 58 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 59 if not (isinstance(alphabet, Alphabet.Alphabet) 60 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 61 raise ValueError("Invalid alphabet argument") 62 self._alphabet = alphabet 63 # hold everything at a list of SeqRecord objects 64 self._records = []
65
66 - def _str_line(self, record, length=50):
67 """Returns a truncated string representation of a SeqRecord (PRIVATE). 68 69 This is a PRIVATE function used by the __str__ method. 70 """ 71 if record.seq.__class__.__name__ == "CodonSeq": 72 if len(record.seq) <= length: 73 return "%s %s" % (record.seq, record.id) 74 else: 75 return "%s...%s %s" \ 76 % (record.seq[:length-3], record.seq[-3:], record.id) 77 else: 78 if len(record.seq) <= length: 79 return "%s %s" % (record.seq, record.id) 80 else: 81 return "%s...%s %s" \ 82 % (record.seq[:length-6], record.seq[-3:], record.id)
83
84 - def __str__(self):
85 """Returns a multi-line string summary of the alignment. 86 87 This output is intended to be readable, but large alignments are 88 shown truncated. A maximum of 20 rows (sequences) and 50 columns 89 are shown, with the record identifiers. This should fit nicely on a 90 single screen. e.g. 91 92 >>> from Bio.Alphabet import IUPAC, Gapped 93 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 94 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 95 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 96 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 97 >>> print(align) 98 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 99 ACTGCTAGCTAG Alpha 100 ACT-CTAGCTAG Beta 101 ACTGCTAGATAG Gamma 102 103 See also the alignment's format method. 104 """ 105 rows = len(self._records) 106 lines = ["%s alignment with %i rows and %i columns" 107 % (str(self._alphabet), rows, self.get_alignment_length())] 108 if rows <= 20: 109 lines.extend(self._str_line(rec) for rec in self._records) 110 else: 111 lines.extend(self._str_line(rec) for rec in self._records[:18]) 112 lines.append("...") 113 lines.append(self._str_line(self._records[-1])) 114 return "\n".join(lines)
115
116 - def __repr__(self):
117 """Returns a representation of the object for debugging. 118 119 The representation cannot be used with eval() to recreate the object, 120 which is usually possible with simple python ojects. For example: 121 122 <Bio.Align.Generic.Alignment instance (2 records of length 14, 123 SingleLetterAlphabet()) at a3c184c> 124 125 The hex string is the memory address of the object, see help(id). 126 This provides a simple way to visually distinguish alignments of 127 the same size. 128 """ 129 #A doctest for __repr__ would be nice, but __class__ comes out differently 130 #if run via the __main__ trick. 131 return "<%s instance (%i records of length %i, %s) at %x>" % \ 132 (self.__class__, len(self._records), 133 self.get_alignment_length(), repr(self._alphabet), id(self))
134 #This version is useful for doing eval(repr(alignment)), 135 #but it can be VERY long: 136 #return "%s(%s, %s)" \ 137 # % (self.__class__, repr(self._records), repr(self._alphabet)) 138
139 - def format(self, format):
140 """Returns the alignment as a string in the specified file format. 141 142 The format should be a lower case string supported as an output 143 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 144 "stockholm", etc), which is used to turn the alignment into a 145 string. 146 147 e.g. 148 149 >>> from Bio.Alphabet import IUPAC, Gapped 150 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 151 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 152 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 153 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 154 >>> print(align.format("fasta")) 155 >Alpha 156 ACTGCTAGCTAG 157 >Beta 158 ACT-CTAGCTAG 159 >Gamma 160 ACTGCTAGATAG 161 <BLANKLINE> 162 >>> print(align.format("phylip")) 163 3 12 164 Alpha ACTGCTAGCT AG 165 Beta ACT-CTAGCT AG 166 Gamma ACTGCTAGAT AG 167 <BLANKLINE> 168 169 For Python 2.6, 3.0 or later see also the built in format() function. 170 """ 171 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 172 #See also the SeqRecord class and its format() method using Bio.SeqIO 173 return self.__format__(format)
174
175 - def __format__(self, format_spec):
176 """Returns the alignment as a string in the specified file format. 177 178 This method supports the python format() function added in 179 Python 2.6/3.0. The format_spec should be a lower case 180 string supported by Bio.AlignIO as an output file format. 181 See also the alignment's format() method.""" 182 if format_spec: 183 from Bio._py3k import StringIO 184 from Bio import AlignIO 185 handle = StringIO() 186 AlignIO.write([self], handle, format_spec) 187 return handle.getvalue() 188 else: 189 #Follow python convention and default to using __str__ 190 return str(self)
191
192 - def get_all_seqs(self):
193 """Return all of the sequences involved in the alignment (DEPRECATED). 194 195 The return value is a list of SeqRecord objects. 196 197 This method is deprecated, as the Alignment object itself now offers 198 much of the functionality of a list of SeqRecord objects (e.g. 199 iteration or slicing to create a sub-alignment). Instead use the 200 Python builtin function list, i.e. my_list = list(my_align) 201 """ 202 import warnings 203 import Bio 204 warnings.warn("This method is deprecated, since the alignment object" 205 "now acts more like a list. Instead of calling " 206 "align.get_all_seqs() you can use list(align)", 207 Bio.BiopythonDeprecationWarning) 208 return self._records
209
210 - def __iter__(self):
211 """Iterate over alignment rows as SeqRecord objects. 212 213 e.g. 214 215 >>> from Bio.Alphabet import IUPAC, Gapped 216 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 217 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 218 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 219 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 220 >>> for record in align: 221 ... print(record.id) 222 ... print(record.seq) 223 Alpha 224 ACTGCTAGCTAG 225 Beta 226 ACT-CTAGCTAG 227 Gamma 228 ACTGCTAGATAG 229 """ 230 return iter(self._records)
231
232 - def get_seq_by_num(self, number):
233 """Retrieve a sequence by row number (DEPRECATED). 234 235 Returns: 236 - A Seq object for the requested sequence. 237 238 Raises: 239 - IndexError - If the specified number is out of range. 240 241 NOTE: This is a legacy method. In new code where you need to access 242 the rows of the alignment (i.e. the sequences) consider iterating 243 over them or accessing them as SeqRecord objects. 244 """ 245 import warnings 246 import Bio 247 warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning) 248 return self._records[number].seq
249
250 - def __len__(self):
251 """Returns the number of sequences in the alignment. 252 253 Use len(alignment) to get the number of sequences (i.e. the number of 254 rows), and alignment.get_alignment_length() to get the length of the 255 longest sequence (i.e. the number of columns). 256 257 This is easy to remember if you think of the alignment as being like a 258 list of SeqRecord objects. 259 """ 260 return len(self._records)
261
262 - def get_alignment_length(self):
263 """Return the maximum length of the alignment. 264 265 All objects in the alignment should (hopefully) have the same 266 length. This function will go through and find this length 267 by finding the maximum length of sequences in the alignment. 268 269 >>> from Bio.Alphabet import IUPAC, Gapped 270 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 271 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 272 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 273 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 274 >>> align.get_alignment_length() 275 12 276 277 If you want to know the number of sequences in the alignment, 278 use len(align) instead: 279 280 >>> len(align) 281 3 282 283 """ 284 max_length = 0 285 286 for record in self._records: 287 if len(record.seq) > max_length: 288 max_length = len(record.seq) 289 290 return max_length
291
292 - def add_sequence(self, descriptor, sequence, start = None, end = None, 293 weight = 1.0):
294 """Add a sequence to the alignment. 295 296 This doesn't do any kind of alignment, it just adds in the sequence 297 object, which is assumed to be prealigned with the existing 298 sequences. 299 300 Arguments: 301 - descriptor - The descriptive id of the sequence being added. 302 This will be used as the resulting SeqRecord's 303 .id property (and, for historical compatibility, 304 also the .description property) 305 - sequence - A string with sequence info. 306 - start - You can explicitly set the start point of the sequence. 307 This is useful (at least) for BLAST alignments, which can 308 just be partial alignments of sequences. 309 - end - Specify the end of the sequence, which is important 310 for the same reason as the start. 311 - weight - The weight to place on the sequence in the alignment. 312 By default, all sequences have the same weight. (0.0 => 313 no weight, 1.0 => highest weight) 314 """ 315 new_seq = Seq(sequence, self._alphabet) 316 317 #We are now effectively using the SeqRecord's .id as 318 #the primary identifier (e.g. in Bio.SeqIO) so we should 319 #populate it with the descriptor. 320 #For backwards compatibility, also store this in the 321 #SeqRecord's description property. 322 new_record = SeqRecord(new_seq, 323 id = descriptor, 324 description = descriptor) 325 326 # hack! We really need to work out how to deal with annotations 327 # and features in biopython. Right now, I'll just use the 328 # generic annotations dictionary we've got to store the start 329 # and end, but we should think up something better. I don't know 330 # if I'm really a big fan of the LocatableSeq thing they've got 331 # in BioPerl, but I'm not positive what the best thing to do on 332 # this is... 333 if start: 334 new_record.annotations['start'] = start 335 if end: 336 new_record.annotations['end'] = end 337 338 # another hack to add weight information to the sequence 339 new_record.annotations['weight'] = weight 340 341 self._records.append(new_record)
342
343 - def get_column(self, col):
344 """Returns a string containing a given column. 345 346 e.g. 347 348 >>> from Bio.Alphabet import IUPAC, Gapped 349 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 350 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 351 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 352 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 353 >>> align.get_column(0) 354 'AAA' 355 >>> align.get_column(3) 356 'G-G' 357 """ 358 #TODO - Support negative indices? 359 col_str = '' 360 assert col >= 0 and col <= self.get_alignment_length() 361 for rec in self._records: 362 col_str += rec.seq[col] 363 return col_str
364
365 - def __getitem__(self, index):
366 """Access part of the alignment. 367 368 We'll use the following example alignment here for illustration: 369 370 >>> from Bio.Alphabet import IUPAC, Gapped 371 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 372 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 373 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 374 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 375 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 376 >>> align.add_sequence("Epsilon", "ACTGCTTGATAG") 377 378 You can access a row of the alignment as a SeqRecord using an integer 379 index (think of the alignment as a list of SeqRecord objects here): 380 381 >>> first_record = align[0] 382 >>> print("%s %s" % (first_record.id, first_record.seq)) 383 Alpha ACTGCTAGCTAG 384 >>> last_record = align[-1] 385 >>> print("%s %s" % (last_record.id, last_record.seq)) 386 Epsilon ACTGCTTGATAG 387 388 You can also access use python's slice notation to create a sub-alignment 389 containing only some of the SeqRecord objects: 390 391 >>> sub_alignment = align[2:5] 392 >>> print(sub_alignment) 393 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 394 ACTGCTAGATAG Gamma 395 ACTGCTTGCTAG Delta 396 ACTGCTTGATAG Epsilon 397 398 This includes support for a step, i.e. align[start:end:step], which 399 can be used to select every second sequence: 400 401 >>> sub_alignment = align[::2] 402 >>> print(sub_alignment) 403 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 404 ACTGCTAGCTAG Alpha 405 ACTGCTAGATAG Gamma 406 ACTGCTTGATAG Epsilon 407 408 Or to get a copy of the alignment with the rows in reverse order: 409 410 >>> rev_alignment = align[::-1] 411 >>> print(rev_alignment) 412 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 413 ACTGCTTGATAG Epsilon 414 ACTGCTTGCTAG Delta 415 ACTGCTAGATAG Gamma 416 ACT-CTAGCTAG Beta 417 ACTGCTAGCTAG Alpha 418 419 Right now, these are the ONLY indexing operations supported. The use of 420 a second column based index is under discussion for a future update. 421 """ 422 if isinstance(index, int): 423 #e.g. result = align[x] 424 #Return a SeqRecord 425 return self._records[index] 426 elif isinstance(index, slice): 427 #e.g. sub_aling = align[i:j:k] 428 #Return a new Alignment using only the specified records. 429 #TODO - See Bug 2554 for changing the __init__ method 430 #to allow us to do this more cleanly. 431 sub_align = Alignment(self._alphabet) 432 sub_align._records = self._records[index] 433 return sub_align 434 elif len(index)==2: 435 raise TypeError("Row and Column indexing is not currently supported," 436 +"but may be in future.") 437 else: 438 raise TypeError("Invalid index type.")
439 440
441 -def _test():
442 """Run the Bio.Align.Generic module's doctests.""" 443 print("Running doctests...") 444 import doctest 445 doctest.testmod() 446 print("Done")
447 448 if __name__ == "__main__": 449 _test() 450