Package Bio :: Package Alphabet
[hide private]
[frames | no frames]

Source Code for Package Bio.Alphabet

  1  # Copyright 2000-2002 by Andrew Dalke. 
  2  # Revisions copyright 2007-2010 by Peter Cock. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Alphabets used in Seq objects etc to declare sequence type and letters. 
  9   
 10  This is used by sequences which contain a finite number of similar words. 
 11  """ 
 12   
 13   
class Alphabet(object):
    """Base class for all alphabets.

    Class attributes:
     - size    - fixed word size, or None when words have no fixed size.
     - letters - the allowed letters, or None when there is no fixed set.
    """

    size = None     # default to no fixed size for words
    letters = None  # default to no fixed alphabet
                    # In general, a list-like object. However,
                    # assuming letters are single characters, use a
                    # string. This is expected for use with Seq like
                    # objects.

    def __repr__(self):
        """Return the class name followed by empty parentheses."""
        return self.__class__.__name__ + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean. This relies on the Alphabet subclassing
        hierarchy only, and does not check the letters property.
        This isn't ideal, and doesn't seem to work as intended
        with the AlphabetEncoder classes."""
        return isinstance(other, self.__class__)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE).

        The result is the generic singleton of the most specific known
        family this alphabet belongs to.
        """
        #TODO - remove this method by dealing with things in subclasses?
        # Order matters: DNA and RNA are subclasses of NucleotideAlphabet,
        # which is itself a SingleLetterAlphabet, so the most specific
        # classes have to be tested first.
        if isinstance(self, ProteinAlphabet):
            return generic_protein
        elif isinstance(self, DNAAlphabet):
            return generic_dna
        elif isinstance(self, RNAAlphabet):
            return generic_rna
        elif isinstance(self, NucleotideAlphabet):
            return generic_nucleotide
        elif isinstance(self, SingleLetterAlphabet):
            return single_letter_alphabet
        else:
            return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        if not self.letters or self.letters == self.letters.upper():
            #Easy case, no letters or already upper case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        if not self.letters or self.letters == self.letters.lower():
            #Easy case, no letters or already lower case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()


generic_alphabet = Alphabet()


class SingleLetterAlphabet(Alphabet):
    """Alphabet whose words are exactly one letter long."""

    size = 1
    letters = None  # string of all letters in the alphabet


single_letter_alphabet = SingleLetterAlphabet()

########### Protein


class ProteinAlphabet(SingleLetterAlphabet):
    """Marker class for single-letter protein alphabets."""

    pass


generic_protein = ProteinAlphabet()

########### DNA


class NucleotideAlphabet(SingleLetterAlphabet):
    """Marker class for single-letter nucleotide alphabets."""

    pass


generic_nucleotide = NucleotideAlphabet()


class DNAAlphabet(NucleotideAlphabet):
    """Marker class for DNA alphabets."""

    pass


generic_dna = DNAAlphabet()


########### RNA


class RNAAlphabet(NucleotideAlphabet):
    """Marker class for RNA alphabets."""

    pass


generic_rna = RNAAlphabet()

########### Other per-sequence encodings
class SecondaryStructure(SingleLetterAlphabet):
    """Single-letter alphabet restricted to the letters H, S, T and C."""

    # NOTE(review): presumably secondary structure state codes; the
    # meaning of each letter is not defined in this module.
    letters = "HSTC"
113 114
class ThreeLetterProtein(Alphabet):
    """Protein alphabet whose words are three-letter residue codes."""

    size = 3  # every word is exactly three characters long
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
        "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
        ]
122 123 ###### Non per-sequence modifications 124 125 # (These are Decorator classes) 126 127
class AlphabetEncoder(object):
    """Decorator wrapping an alphabet and adding extra letters to it.

    Attributes:
     - alphabet    - the wrapped alphabet object.
     - new_letters - the letters added by this encoder.
     - letters     - the wrapped alphabet's letters followed by
                     new_letters, or None if the wrapped alphabet does
                     not define any letters.

    Any other (non double-underscore) attribute that is not found on the
    encoder itself is looked up on the wrapped alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap alphabet, extending its letters (if any) with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate missing attributes to the wrapped alphabet.

        Raises AttributeError for double-underscore names, and for
        'alphabet' itself when it has not been set yet.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # This hook only runs when normal lookup failed, so the wrapped
            # alphabet is missing (e.g. instance created without __init__).
            # Reading self.alphabet below would re-enter this method forever.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return 'ClassName(wrapped_alphabet_repr, new_letters_repr)'."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False."""
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())
160 161
class Gapped(AlphabetEncoder):
    """Alphabet encoder that adds a gap character to the wrapped alphabet.

    The gap character is stored as gap_char and is also registered as
    the encoder's new_letters.
    """

    def __init__(self, alphabet, gap_char = "-"):
        """Wrap alphabet, adding gap_char as its extra letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean. Both alphabets must use the same gap
        character, and the wrapped alphabet must 'contain' the other's
        wrapped alphabet (which relies on the Alphabet subclassing
        hierarchy). This fails if the other alphabet does not have a
        gap character!
        """
        if other.gap_char != self.gap_char:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return Gapped(upper_base, self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return Gapped(lower_base, self.gap_char.lower())
184 185
class HasStopCodon(AlphabetEncoder):
    """Alphabet encoder that adds a stop symbol to the wrapped alphabet.

    The stop symbol is stored as stop_symbol and is also registered as
    the encoder's new_letters.
    """

    def __init__(self, alphabet, stop_symbol = "*"):
        """Wrap alphabet, adding stop_symbol as its extra letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Compare by wrapped alphabet first, then by stop symbol.

        Relies on the builtin cmp function.
        """
        outcome = cmp(self.alphabet, other.alphabet)
        if outcome != 0:
            return outcome
        # Wrapped alphabets compare equal, so the stop symbols decide.
        return cmp(self.stop_symbol, other.stop_symbol)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean. Both alphabets must use the same stop
        symbol, and the wrapped alphabet must 'contain' the other's
        wrapped alphabet (which relies on the Alphabet subclassing
        hierarchy). This fails if the other alphabet does not have a
        stop symbol!
        """
        if other.stop_symbol != self.stop_symbol:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return HasStopCodon(upper_base, self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return HasStopCodon(lower_base, self.stop_symbol.lower())
214 215
def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Encoder layers are peeled off one at a time until something that is
    not an AlphabetEncoder is reached; that object must be an Alphabet.
    """
    if isinstance(alphabet, AlphabetEncoder):
        # Still inside a decorator layer - unwrap it and try again.
        return _get_base_alphabet(alphabet.alphabet)
    assert isinstance(alphabet, Alphabet), \
        "Invalid alphabet found, %s" % repr(alphabet)
    return alphabet
224 225
def _ungap(alphabet):
    """Returns the alphabet without any gap encoder (PRIVATE).

    Alphabets without a gap_char attribute are returned unchanged. A
    Gapped encoder is replaced by the alphabet it wraps, while other
    encoders are rebuilt around the ungapped version of what they wrap.

    Raises NotImplementedError for an object which exposes gap_char but
    is not an AlphabetEncoder.
    """
    #TODO - Handle via method of the objects?
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    elif isinstance(alphabet, Gapped):
        return alphabet.alphabet
    elif isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet),
                            stop_symbol=alphabet.stop_symbol)
    elif isinstance(alphabet, AlphabetEncoder):
        # Rebuild with only the letters this encoder added. The constructor
        # argument is called new_letters, and the merged letters attribute
        # would also carry the base letters and the gap character.
        return AlphabetEncoder(_ungap(alphabet.alphabet),
                               new_letters=alphabet.new_letters)
    else:
        raise NotImplementedError
239 240
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    This throws away any AlphabetEncoder information, e.g. Gapped alphabets.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!"""
    consensus = None
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)
        if consensus is None:
            # First alphabet seen becomes the starting point.
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same alphabet, or the consensus is already at least as
            # general as this one - nothing to change.
            continue
        if isinstance(consensus, base.__class__):
            # This alphabet is the more general of the two.
            consensus = base
            continue
        if isinstance(base, NucleotideAlphabet) \
        and isinstance(consensus, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
            continue
        if isinstance(base, SingleLetterAlphabet) \
        and isinstance(consensus, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            consensus = single_letter_alphabet
            continue
        #We have a major mis-match... take the easy way out!
        return generic_alphabet
    #Given NO alphabets, fall back on the generic alphabet.
    return generic_alphabet if consensus is None else consensus
274 275
def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders. This WILL raise an exception if more than
    one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present
    """
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    new_letters = ""
    for alpha in alphabets:
        #Gaps: remember the first one seen, reject any different one.
        if hasattr(alpha, "gap_char"):
            if gap is None:
                gap = alpha.gap_char
            elif gap != alpha.gap_char:
                raise ValueError("More than one gap character present")
        #Stops: same policy as for the gap character.
        if hasattr(alpha, "stop_symbol"):
            if stop is None:
                stop = alpha.stop_symbol
            elif stop != alpha.stop_symbol:
                raise ValueError("More than one stop symbol present")
        #New letters: collect each once, leaving out the gap and stop
        #characters found so far (they get their own encoders below).
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                if letter in new_letters or letter == gap or letter == stop:
                    continue
                new_letters += letter

    # Rebuild the decorator stack around the consensus base alphabet:
    # extra letters innermost, then the gap, then the stop symbol.
    result = base
    if new_letters:
        result = AlphabetEncoder(result, new_letters)
    if gap:
        result = Gapped(result, gap_char=gap)
    if stop:
        result = HasStopCodon(result, stop_symbol=stop)
    return result
351 352
def _check_type_compatible(alphabets):
    """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    >>> _check_type_compatible([generic_dna, generic_nucleotide])
    True
    >>> _check_type_compatible([generic_dna, generic_rna])
    False
    >>> _check_type_compatible([generic_dna, generic_protein])
    False
    >>> _check_type_compatible([single_letter_alphabet, generic_protein])
    True

    This relies on the Alphabet subclassing hierarchy. It does not
    check things like gap characters or stop symbols."""
    seen = set()  # families met so far: "dna", "rna", "nucl", "protein"
    for alpha in alphabets:
        base = _get_base_alphabet(alpha)
        # DNA and RNA are tested before the general nucleotide class
        # because they are subclasses of it.
        if isinstance(base, DNAAlphabet):
            seen.update(("dna", "nucl"))
            clashes = ("rna", "protein")
        elif isinstance(base, RNAAlphabet):
            seen.update(("rna", "nucl"))
            clashes = ("dna", "protein")
        elif isinstance(base, NucleotideAlphabet):
            seen.add("nucl")
            clashes = ("protein",)
        elif isinstance(base, ProteinAlphabet):
            seen.add("protein")
            clashes = ("nucl",)
        else:
            # Any other alphabet is compatible with everything.
            continue
        if seen.intersection(clashes):
            return False
    return True
389 390
def _verify_alphabet(sequence):
    """Check all letters in sequence are in the alphabet (PRIVATE).

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
    ...              IUPAC.protein)
    >>> _verify_alphabet(my_seq)
    True

    This example has an X, which is not in the IUPAC protein alphabet
    (you should be using the IUPAC extended protein alphabet):

    >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX",
    ...                IUPAC.protein)
    >>> _verify_alphabet(bad_seq)
    False

    Raises ValueError if the sequence's alphabet does not define any
    letters.

    This replaces Bio.utils.verify_alphabet() since we are deprecating
    that. Potentially this could be added to the Alphabet object, and
    I would like it to be an option when creating a Seq object... but
    that might slow things down.
    """
    allowed = sequence.alphabet.letters
    if not allowed:
        raise ValueError("Alphabet does not define letters.")
    # Stops at the first letter which is not part of the alphabet.
    return all(symbol in allowed for symbol in sequence)
421