1
2
3
4
5
6
7
8 """Alphabets used in Seq objects etc to declare sequence type and letters.
9
10 This is used by sequences which contain a finite number of similar words.
11 """
12
13
15 size = None
16 letters = None
17
18
19
20
21
23 return self.__class__.__name__ + "()"
24
26 """Does this alphabet 'contain' the other (OBSOLETE?).
27
28 Returns a boolean. This relies on the Alphabet subclassing
29 hierarchy only, and does not check the letters property.
30 This isn't ideal, and doesn't seem to work as intended
31 with the AlphabetEncoder classes."""
32 return isinstance(other, self.__class__)
33
49
51 """Return an upper case variant of the current alphabet (PRIVATE)."""
52 if not self.letters or self.letters==self.letters.upper():
53
54 return self
55 else:
56
57 return self._case_less()
58
60 """Return a lower case variant of the current alphabet (PRIVATE)."""
61 if not self.letters or self.letters==self.letters.lower():
62
63 return self
64 else:
65
66 return self._case_less()
67
68 generic_alphabet = Alphabet()
69
70
74
75 single_letter_alphabet = SingleLetterAlphabet()
76
77
78
79
82
83 generic_protein = ProteinAlphabet()
84
85
86
87
90
91 generic_nucleotide = NucleotideAlphabet()
92
93
96
97 generic_dna = DNAAlphabet()
98
99
100
101
102
105
106 generic_rna = RNAAlphabet()
107
108
109
110
113
114
116 size = 3
117 letters = [
118 "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
119 "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
120 "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
121 ]
122
123
124
125
126
127
129 - def __init__(self, alphabet, new_letters):
136
138 if key[:2] == "__" and key[-2:] == "__":
139 raise AttributeError(key)
140 return getattr(self.alphabet, key)
141
143 return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
144 self.new_letters)
145
147 """Does this alphabet 'contain' the other (OBSOLETE?).
148
149 This isn't implemented for the base AlphabetEncoder,
150 which will always return 0 (False)."""
151 return 0
152
156
160
161
162 -class Gapped(AlphabetEncoder):
163 - def __init__(self, alphabet, gap_char = "-"):
166
168 """Does this alphabet 'contain' the other (OBSOLETE?).
169
170 Returns a boolean. This relies on the Alphabet subclassing
171 hierarchy, and attempts to check the gap character. This fails
172 if the other alphabet does not have a gap character!
173 """
174 return other.gap_char == self.gap_char and \
175 self.alphabet.contains(other.alphabet)
176
178 """Return an upper case variant of the current alphabet (PRIVATE)."""
179 return Gapped(self.alphabet._upper(), self.gap_char.upper())
180
182 """Return a lower case variant of the current alphabet (PRIVATE)."""
183 return Gapped(self.alphabet._lower(), self.gap_char.lower())
184
185
187 - def __init__(self, alphabet, stop_symbol = "*"):
190
196
198 """Does this alphabet 'contain' the other (OBSOLETE?).
199
200 Returns a boolean. This relies on the Alphabet subclassing
201 hierarchy, and attempts to check the stop symbol. This fails
202 if the other alphabet does not have a stop symbol!
203 """
204 return other.stop_symbol == self.stop_symbol and \
205 self.alphabet.contains(other.alphabet)
206
210
214
215
def _get_base_alphabet(alphabet):
    """Return the innermost non-encoder alphabet object (PRIVATE).

    Follows the ``alphabet`` attribute of each AlphabetEncoder wrapper
    (e.g. a Gapped alphabet) until an object that is not an
    AlphabetEncoder is reached, and returns that object.

    An assertion checks that the unwrapped object is an Alphabet instance.
    """
    base = alphabet
    while True:
        if not isinstance(base, AlphabetEncoder):
            break
        # Step one wrapper layer inwards.
        base = base.alphabet
    assert isinstance(base, Alphabet), "Invalid alphabet found, %s" % repr(base)
    return base
224
225
239
240
def _consensus_base_alphabet(alphabets):
    """Return a common but often generic base alphabet object (PRIVATE).

    Every AlphabetEncoder layer (e.g. Gapped) is discarded first, so only
    the underlying base alphabets are compared.

    Mixing is never an error here: DNA+RNA gives the generic nucleotide
    alphabet, Nucleotide+Protein gives the generic single letter alphabet,
    and anything that cannot be reconciled (or an empty input) gives the
    fully generic alphabet. No exception is raised for these cases.
    """
    consensus = None
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)

        if consensus is None:
            # First alphabet seen simply seeds the consensus.
            consensus = base
            continue

        if consensus == base or isinstance(base, consensus.__class__):
            # Same object, or the new one is at least as specific as the
            # current consensus: keep the (more general) consensus.
            continue

        if isinstance(consensus, base.__class__):
            # The new alphabet is the more general of the two.
            consensus = base
            continue

        if isinstance(base, NucleotideAlphabet) \
                and isinstance(consensus, NucleotideAlphabet):
            # e.g. DNA mixed with RNA
            consensus = generic_nucleotide
        elif isinstance(base, SingleLetterAlphabet) \
                and isinstance(consensus, SingleLetterAlphabet):
            # e.g. nucleotide mixed with protein
            consensus = single_letter_alphabet
        else:
            # Nothing in common; stop looking straight away.
            return generic_alphabet

    # An empty input never assigned a consensus.
    return generic_alphabet if consensus is None else consensus
274
275
def _consensus_alphabet(alphabets):
    """Return a common but often generic alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders.  This WILL raise an exception if more than
    one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present

    The argument is iterated more than once, so it must be a re-iterable
    collection (e.g. a list), not a one-shot iterator.

    Raises ValueError if the alphabets disagree on the gap character or on
    the stop symbol.
    """
    base = _consensus_base_alphabet(alphabets)

    # First pass: settle the gap character and the stop symbol.  They must
    # be known before collecting the extra letters, otherwise an encoder
    # listed *before* the alphabet introducing the gap/stop would keep that
    # character in new_letters and it would end up encoded twice.
    gap = None
    stop = None
    for alpha in alphabets:
        if hasattr(alpha, "gap_char"):
            if gap is None:
                gap = alpha.gap_char
            elif gap != alpha.gap_char:
                raise ValueError("More than one gap character present")
        if hasattr(alpha, "stop_symbol"):
            if stop is None:
                stop = alpha.stop_symbol
            elif stop != alpha.stop_symbol:
                raise ValueError("More than one stop symbol present")

    # Second pass: gather the remaining extra letters in first-seen order,
    # skipping duplicates and the gap/stop characters (those get their own
    # dedicated wrappers below).
    new_letters = ""
    for alpha in alphabets:
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                if letter not in new_letters \
                        and letter != gap and letter != stop:
                    new_letters += letter

    # Re-wrap the consensus base: extra letters innermost, then the gap,
    # then the stop symbol outermost.
    alpha = base
    if new_letters:
        alpha = AlphabetEncoder(alpha, new_letters)
    if gap:
        alpha = Gapped(alpha, gap_char=gap)
    if stop:
        alpha = HasStopCodon(alpha, stop_symbol=stop)
    return alpha
351
352
def _check_type_compatible(alphabets):
    """Return True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    >>> _check_type_compatible([generic_dna, generic_nucleotide])
    True
    >>> _check_type_compatible([generic_dna, generic_rna])
    False
    >>> _check_type_compatible([generic_dna, generic_protein])
    False
    >>> _check_type_compatible([single_letter_alphabet, generic_protein])
    True

    Only the class of each base alphabet (encoders stripped) is examined;
    gap characters, stop symbols and the like are not compared.
    """
    seen_dna = seen_rna = seen_nucl = seen_protein = False
    for entry in alphabets:
        base = _get_base_alphabet(entry)
        # Record what this alphabet is, then work out whether it clashes
        # with anything recorded from the alphabets seen so far.
        if isinstance(base, DNAAlphabet):
            seen_dna = seen_nucl = True
            clash = seen_rna or seen_protein
        elif isinstance(base, RNAAlphabet):
            seen_rna = seen_nucl = True
            clash = seen_dna or seen_protein
        elif isinstance(base, NucleotideAlphabet):
            seen_nucl = True
            clash = seen_protein
        elif isinstance(base, ProteinAlphabet):
            seen_protein = True
            clash = seen_nucl
        else:
            # Any other alphabet type is compatible with everything.
            clash = False
        if clash:
            return False
    return True
389
390
def _verify_alphabet(sequence):
    """Check all letters in sequence are in the alphabet (PRIVATE).

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
    ...              IUPAC.protein)
    >>> _verify_alphabet(my_seq)
    True

    This example has an X, which is not in the IUPAC protein alphabet
    (you should be using the IUPAC extended protein alphabet):

    >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX",
    ...                IUPAC.protein)
    >>> _verify_alphabet(bad_seq)
    False

    This replaces Bio.utils.verify_alphabet() since we are deprecating
    that.  Potentially this could be added to the Alphabet object, and
    I would like it to be an option when creating a Seq object... but
    that might slow things down.

    The argument must be iterable and expose ``.alphabet.letters``.
    Returns True when every item yielded by iterating the sequence is
    found in those letters (so an empty sequence is valid), else False.

    Raises ValueError if the alphabet's ``letters`` is None or empty, as
    there is then nothing to check against.
    """
    letters = sequence.alphabet.letters
    if not letters:
        raise ValueError("Alphabet does not define letters.")
    # all() stops at the first letter that is not in the alphabet.
    return all(letter in letters for letter in sequence)
421