Package Bio :: Package Alphabet
[hide private]
[frames | no frames]

Source Code for Package Bio.Alphabet

# Copyright 2000-2002 by Andrew Dalke.
# Revisions copyright 2007-2010 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Alphabets used in Seq objects etc to declare sequence type and letters.

This is used by sequences which contain a finite number of similar words.
"""


class Alphabet(object):
    """Root class of the alphabet hierarchy.

    All other alphabet types derive from this class.

    Attributes:
    letters -- list-like object holding the letters of the alphabet.
               A plain string is the usual choice when every letter is a
               single character.
    size    -- width of one letter of the alphabet (e.g. 1 when the
               letters are single characters).

    """

    # By default words have no fixed size.
    size = None
    # By default there is no fixed set of letters. In general this is a
    # list-like object, but when letters are single characters a string is
    # used, which is what Seq like objects expect.
    letters = None

    def __repr__(self):
        """Return the class name followed by an empty pair of parentheses."""
        class_name = self.__class__.__name__
        return class_name + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean. Only the Alphabet subclassing hierarchy is
        consulted (other must be an instance of this object's class); the
        letters property is not checked. This isn't ideal, and doesn't
        seem to work as intended with the AlphabetEncoder classes.
        """
        own_class = self.__class__
        return isinstance(other, own_class)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE)."""
        #TODO - remove this method by dealing with things in subclasses?
        # Tested from the most specific class to the most general one, so
        # the first match is the closest generic alphabet.
        if isinstance(self, ProteinAlphabet):
            return generic_protein
        if isinstance(self, DNAAlphabet):
            return generic_dna
        if isinstance(self, RNAAlphabet):
            return generic_rna
        if isinstance(self, NucleotideAlphabet):
            return generic_nucleotide
        if isinstance(self, SingleLetterAlphabet):
            return single_letter_alphabet
        return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        if not self.letters:
            # Easy case, no letters to convert.
            return self
        if self.letters == self.letters.upper():
            # Easy case, already upper case.
            return self
        #TODO - Raise NotImplementedError and handle via subclass?
        return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        if not self.letters:
            # Easy case, no letters to convert.
            return self
        if self.letters == self.letters.lower():
            # Easy case, already lower case.
            return self
        #TODO - Raise NotImplementedError and handle via subclass?
        return self._case_less()


generic_alphabet = Alphabet()

class SingleLetterAlphabet(Alphabet):
    """Generic alphabet in which every letter has size one."""

    # Each letter occupies exactly one character.
    size = 1
    # String of all the letters in the alphabet; not fixed for the generic
    # case.
    letters = None


single_letter_alphabet = SingleLetterAlphabet()

########### Protein

class ProteinAlphabet(SingleLetterAlphabet):
    """Generic protein alphabet, one character per letter."""


generic_protein = ProteinAlphabet()

########### DNA

class NucleotideAlphabet(SingleLetterAlphabet):
    """Generic nucleotide alphabet, one character per letter."""


generic_nucleotide = NucleotideAlphabet()

class DNAAlphabet(NucleotideAlphabet):
    """Generic DNA alphabet, one character per letter."""


generic_dna = DNAAlphabet()


########### RNA

class RNAAlphabet(NucleotideAlphabet):
    """Generic RNA alphabet, one character per letter."""


generic_rna = RNAAlphabet()

########### Other per-sequence encodings

class SecondaryStructure(SingleLetterAlphabet):
    """Single letter alphabet for describing secondary structure.

    The four letters are 'H' (helix), 'S' (strand), 'T' (turn) and
    'C' (coil).
    """

    letters = "HSTC"
134 135
class ThreeLetterProtein(Alphabet):
    """Three letter protein alphabet.

    Unlike the single letter alphabets, ``letters`` is a list of three
    character codes rather than a string. The string based case conversion
    inherited from Alphabet therefore cannot work on it, so the case
    conversion hooks are overridden to fail with an explicit error.
    """

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
        "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
        ]

    def _upper(self):
        """Refuse to build an upper case variant (PRIVATE).

        Raises NotImplementedError. The inherited implementation would call
        ``.upper()`` on the list of letters and fail with an unrelated
        AttributeError.
        """
        raise NotImplementedError(
            "We don't have an uppercase three letter protein alphabet.")

    def _lower(self):
        """Refuse to build a lower case variant (PRIVATE).

        Raises NotImplementedError. The inherited implementation would call
        ``.lower()`` on the list of letters and fail with an unrelated
        AttributeError.
        """
        raise NotImplementedError(
            "We don't have a lowercase three letter protein alphabet.")

###### Non per-sequence modifications

# (These are Decorator classes)


class AlphabetEncoder(object):
    """Decorator that wraps an alphabet and adds extra letters to it.

    Attributes:
    alphabet    -- the wrapped alphabet object.
    new_letters -- the letters added on top of the wrapped alphabet.
    letters     -- the wrapped alphabet's letters followed by new_letters,
                   or None when the wrapped alphabet defines no letters.

    Any other (non double-underscore) attribute is looked up on the wrapped
    alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap alphabet, extending its letters (if any) with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Forward unknown attribute lookups to the wrapped alphabet.

        Python only calls this once normal lookup on the encoder has
        failed. Double-underscore names are never forwarded; they raise
        AttributeError here instead.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return e.g. AlphabetEncoder(<wrapped repr>, <new letters repr>)."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False.
        """
        # A real boolean, matching the other contains() methods. False == 0,
        # so callers comparing against the old integer result still work.
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(),
                               self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(),
                               self.new_letters.lower())
182 183
class Gapped(AlphabetEncoder):
    """Alphabet decorator which adds a gap character to another alphabet.

    Attributes:
    gap_char -- the gap character, which is also stored as the encoder's
                new_letters.
    """

    def __init__(self, alphabet, gap_char="-"):
        """Wrap alphabet, adding gap_char as its gap character."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean. This relies on the Alphabet subclassing
        hierarchy, and attempts to check the gap character. An other
        alphabet which has no gap character is simply not contained
        (False is returned rather than raising AttributeError).
        """
        try:
            same_gap = other.gap_char == self.gap_char
        except AttributeError:
            # other is not gapped at all, so it cannot match this alphabet.
            return False
        return same_gap and self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return Gapped(self.alphabet._upper(), self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return Gapped(self.alphabet._lower(), self.gap_char.lower())
206 207
class HasStopCodon(AlphabetEncoder):
    """Alphabet decorator which adds a stop symbol to another alphabet.

    Attributes:
    stop_symbol -- the stop symbol, which is also stored as the encoder's
                   new_letters.
    """

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap alphabet, adding stop_symbol as its stop symbol."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Three-way compare on the wrapped alphabet, then the stop symbol.

        NOTE: this uses the cmp() builtin, which only exists on Python 2.
        """
        x = cmp(self.alphabet, other.alphabet)
        if x == 0:
            # Same wrapped alphabet, so the stop symbol decides.
            return cmp(self.stop_symbol, other.stop_symbol)
        return x

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean. This relies on the Alphabet subclassing
        hierarchy, and attempts to check the stop symbol. An other
        alphabet which has no stop symbol is simply not contained
        (False is returned rather than raising AttributeError).
        """
        try:
            same_stop = other.stop_symbol == self.stop_symbol
        except AttributeError:
            # other has no stop symbol, so it cannot match this alphabet.
            return False
        return same_stop and self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return HasStopCodon(self.alphabet._upper(), self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return HasStopCodon(self.alphabet._lower(), self.stop_symbol.lower())
236 237
def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Any AlphabetEncoder wrappers are peeled off one at a time until the
    innermost object is reached, which must be an Alphabet instance.
    """
    if isinstance(alphabet, AlphabetEncoder):
        # Still wrapped - look one level further in.
        return _get_base_alphabet(alphabet.alphabet)
    assert isinstance(alphabet, Alphabet), \
        "Invalid alphabet found, %s" % repr(alphabet)
    return alphabet
246 247
def _ungap(alphabet):
    """Returns the alphabet without any gap encoder (PRIVATE).

    An alphabet with no gap_char attribute is returned unchanged. A Gapped
    alphabet gives back the alphabet it wraps. Other encoders are rebuilt
    around the ungapped version of the alphabet they wrap, keeping their
    own stop symbol or new letters.

    Raises NotImplementedError for an object which has a gap_char but is
    not an AlphabetEncoder.
    """
    #TODO - Handle via method of the objects?
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    elif isinstance(alphabet, Gapped):
        return alphabet.alphabet
    elif isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet),
                            stop_symbol=alphabet.stop_symbol)
    elif isinstance(alphabet, AlphabetEncoder):
        # AlphabetEncoder takes (alphabet, new_letters); there is no
        # "letters" argument. Pass only this encoder's own extra letters,
        # since its letters attribute also holds those of the inner
        # (gapped) alphabet.
        return AlphabetEncoder(_ungap(alphabet.alphabet),
                               alphabet.new_letters)
    else:
        raise NotImplementedError
261 262
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    This throws away any AlphabetEncoder information, e.g. Gapped alphabets.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!"""
    consensus = None
    for alphabet in alphabets:
        base = _get_base_alphabet(alphabet)
        if consensus is None:
            # First alphabet seen, nothing to reconcile yet.
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same alphabet, or the consensus class already covers it.
            continue
        if isinstance(consensus, base.__class__):
            # The new alphabet's class covers the consensus so far.
            consensus = base
        elif isinstance(base, NucleotideAlphabet) \
        and isinstance(consensus, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
        elif isinstance(base, SingleLetterAlphabet) \
        and isinstance(consensus, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            consensus = single_letter_alphabet
        else:
            #We have a major mis-match... take the easy way out!
            return generic_alphabet
    if consensus is None:
        #Given NO alphabets!
        return generic_alphabet
    return consensus
296 297
def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    The argument may be any iterable of alphabet objects, including a
    one-shot iterator such as a generator expression.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders. This WILL raise an exception if more than
    one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present
    """
    # The alphabets are walked twice (once for the base alphabet, once for
    # the gap/stop/new letters). Take a list copy first, otherwise a
    # generator argument would already be exhausted for the second pass and
    # the gap and stop information would be silently lost.
    alphabets = list(alphabets)
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    new_letters = ""
    for alpha in alphabets:
        #Gaps...
        if not hasattr(alpha, "gap_char"):
            pass
        elif gap is None:
            gap = alpha.gap_char
        elif gap == alpha.gap_char:
            pass
        else:
            raise ValueError("More than one gap character present")
        #Stops...
        if not hasattr(alpha, "stop_symbol"):
            pass
        elif stop is None:
            stop = alpha.stop_symbol
        elif stop == alpha.stop_symbol:
            pass
        else:
            raise ValueError("More than one stop symbol present")
        #New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                # The gap and stop characters get their own encoders
                # below, so they are not repeated as plain new letters.
                if letter not in new_letters \
                and letter != gap and letter != stop:
                    new_letters += letter

    # Rebuild the encoders around the base alphabet, innermost first.
    alpha = base
    if new_letters:
        alpha = AlphabetEncoder(alpha, new_letters)
    if gap:
        alpha = Gapped(alpha, gap_char=gap)
    if stop:
        alpha = HasStopCodon(alpha, stop_symbol=stop)
    return alpha
373 374
def _check_type_compatible(alphabets):
    """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    >>> _check_type_compatible([generic_dna, generic_nucleotide])
    True
    >>> _check_type_compatible([generic_dna, generic_rna])
    False
    >>> _check_type_compatible([generic_dna, generic_protein])
    False
    >>> _check_type_compatible([single_letter_alphabet, generic_protein])
    True

    This relies on the Alphabet subclassing hierarchy. It does not
    check things like gap characters or stop symbols."""
    # Kinds of base alphabet met so far ("dna", "rna", "nucl", "protein").
    seen = set()
    for alphabet in alphabets:
        base = _get_base_alphabet(alphabet)
        if isinstance(base, DNAAlphabet):
            seen.update(("dna", "nucl"))
            clashes = ("rna", "protein")
        elif isinstance(base, RNAAlphabet):
            seen.update(("rna", "nucl"))
            clashes = ("dna", "protein")
        elif isinstance(base, NucleotideAlphabet):
            seen.add("nucl")
            clashes = ("protein",)
        elif isinstance(base, ProteinAlphabet):
            seen.add("protein")
            clashes = ("nucl",)
        else:
            # Any other alphabet is compatible with everything.
            continue
        if not seen.isdisjoint(clashes):
            # Stop at the first incompatible pairing.
            return False
    return True
411 412
413 -def _verify_alphabet(sequence):
414 """Check all letters in sequence are in the alphabet (PRIVATE). 415 416 >>> from Bio.Seq import Seq 417 >>> from Bio.Alphabet import IUPAC 418 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 419 ... IUPAC.protein) 420 >>> _verify_alphabet(my_seq) 421 True 422 423 This example has an X, which is not in the IUPAC protein alphabet 424 (you should be using the IUPAC extended protein alphabet): 425 426 >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX", 427 ... IUPAC.protein) 428 >>> _verify_alphabet(bad_seq) 429 False 430 431 This replaces Bio.utils.verify_alphabet() since we are deprecating 432 that. Potentially this could be added to the Alphabet object, and 433 I would like it to be an option when creating a Seq object... but 434 that might slow things down. 435 """ 436 letters = sequence.alphabet.letters 437 if not letters: 438 raise ValueError("Alphabet does not define letters.") 439 for letter in sequence: 440 if letter not in letters: 441 return False 442 return True
443