Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2014 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also the Seq_ wiki and the chapter in our tutorial: 
  11      - `HTML Tutorial`_ 
  12      - `PDF Tutorial`_ 
  13   
  14  .. _Seq: http://biopython.org/wiki/Seq 
  15  .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html 
  16  .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf 
  17  """ 
  18  from __future__ import print_function 
  19   
  20  __docformat__ = "restructuredtext en"  # Don't just use plain text in epydoc API pages! 
  21   
  22  import string  # for maketrans only 
  23  import array 
  24  import sys 
  25  import warnings 
  26   
  27  from Bio._py3k import range 
  28  from Bio._py3k import basestring 
  29   
  30  from Bio import BiopythonWarning 
  31  from Bio import Alphabet 
  32  from Bio.Alphabet import IUPAC 
  33  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  34  from Bio.Data import CodonTable 
  35   
  36   
37 -def _maketrans(complement_mapping):
38 """Makes a python string translation table (PRIVATE). 39 40 Arguments: 41 - complement_mapping - a dictionary such as ambiguous_dna_complement 42 and ambiguous_rna_complement from Data.IUPACData. 43 44 Returns a translation table (a string of length 256) for use with the 45 python string's translate method to use in a (reverse) complement. 46 47 Compatible with lower case and upper case sequences. 48 49 For internal use only. 50 """ 51 before = ''.join(complement_mapping.keys()) 52 after = ''.join(complement_mapping.values()) 53 before += before.lower() 54 after += after.lower() 55 if sys.version_info[0] == 3: 56 return str.maketrans(before, after) 57 else: 58 return string.maketrans(before, after)
59 60 _dna_complement_table = _maketrans(ambiguous_dna_complement) 61 _rna_complement_table = _maketrans(ambiguous_rna_complement) 62 63
64 -class Seq(object):
65 """A read-only sequence object (essentially a string with an alphabet). 66 67 Like normal python strings, our basic sequence object is immutable. 68 This prevents you from doing my_seq[5] = "A" for example, but does allow 69 Seq objects to be used as dictionary keys. 70 71 The Seq object provides a number of string like methods (such as count, 72 find, split and strip), which are alphabet aware where appropriate. 73 74 In addition to the string like sequence, the Seq object has an alphabet 75 property. This is an instance of an Alphabet class from Bio.Alphabet, 76 for example generic DNA, or IUPAC DNA. This describes the type of molecule 77 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 78 (letters). 79 80 The Seq object also provides some biological methods, such as complement, 81 reverse_complement, transcribe, back_transcribe and translate (which are 82 not applicable to sequences with a protein alphabet). 83 """
84 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
85 """Create a Seq object. 86 87 Arguments: 88 - seq - Sequence, required (string) 89 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 90 91 You will typically use Bio.SeqIO to read in sequences from files as 92 SeqRecord objects, whose sequence will be exposed as a Seq object via 93 the seq property. 94 95 However, will often want to create your own Seq objects directly: 96 97 >>> from Bio.Seq import Seq 98 >>> from Bio.Alphabet import IUPAC 99 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 100 ... IUPAC.protein) 101 >>> my_seq 102 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 103 >>> print(my_seq) 104 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 105 >>> my_seq.alphabet 106 IUPACProtein() 107 """ 108 # Enforce string storage 109 if not isinstance(data, basestring): 110 raise TypeError("The sequence data given to a Seq object should " 111 "be a string (not another Seq object etc)") 112 self._data = data 113 self.alphabet = alphabet # Seq API requirement
114
115 - def __repr__(self):
116 """Returns a (truncated) representation of the sequence for debugging.""" 117 if len(self) > 60: 118 # Shows the last three letters as it is often useful to see if there 119 # is a stop codon at the end of a sequence. 120 # Note total length is 54+3+3=60 121 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 122 str(self)[:54], 123 str(self)[-3:], 124 self.alphabet) 125 else: 126 return '{0}({1!r}, {2!r})'.format(self.__class__.__name__, 127 self._data, 128 self.alphabet)
129
130 - def __str__(self):
131 """Returns the full sequence as a python string, use str(my_seq). 132 133 Note that Biopython 1.44 and earlier would give a truncated 134 version of repr(my_seq) for str(my_seq). If you are writing code 135 which need to be backwards compatible with old Biopython, you 136 should continue to use my_seq.tostring() rather than str(my_seq). 137 """ 138 return self._data
139
140 - def __hash__(self):
141 """Hash for comparison. 142 143 See the __cmp__ documentation - this has changed from past 144 versions of Biopython! 145 """ 146 # TODO - remove this warning in a future release 147 warnings.warn("Biopython Seq objects now use string comparison. " 148 "Older versions of Biopython used object comparison. " 149 "During this transition, please use hash(id(my_seq)) " 150 "or my_dict[id(my_seq)] if you want the old behaviour, " 151 "or use hash(str(my_seq)) or my_dict[str(my_seq)] for " 152 "the new string hashing behaviour.", BiopythonWarning) 153 return hash(str(self))
154
155 - def __eq__(self, other):
156 """Compare the sequence to another sequence or a string (README). 157 158 Historically comparing Seq objects has done Python object comparison. 159 After considerable discussion (keeping in mind constraints of the 160 Python language, hashes and dictionary support), Biopython now uses 161 simple string comparison (with a warning about the change). 162 163 Note that incompatible alphabets (e.g. DNA to RNA) will trigger a 164 warning. 165 166 During this transition period, please just do explicit comparisons: 167 168 >>> from Bio.Seq import Seq 169 >>> from Bio.Alphabet import generic_dna 170 >>> seq1 = Seq("ACGT") 171 >>> seq2 = Seq("ACGT") 172 >>> id(seq1) == id(seq2) 173 False 174 >>> str(seq1) == str(seq2) 175 True 176 177 The new behaviour is to use string-like equality: 178 179 >>> from Bio.Seq import Seq 180 >>> from Bio.Alphabet import generic_dna 181 >>> seq1 == seq2 182 True 183 >>> seq1 == "ACGT" 184 True 185 >>> seq1 == Seq("ACGT", generic_dna) 186 True 187 188 """ 189 if hasattr(other, "alphabet"): 190 # other could be a Seq or a MutableSeq 191 if not Alphabet._check_type_compatible([self.alphabet, 192 other.alphabet]): 193 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 194 self.alphabet, other.alphabet), 195 BiopythonWarning) 196 return str(self) == str(other)
197
198 - def __ne__(self, other):
199 """Not equal, see __eq__ documentation.""" 200 # Seem to require this method under Python 2 but not needed on Python 3? 201 return not (self == other)
202
203 - def __lt__(self, other):
204 """Less than, see __eq__ documentation.""" 205 if hasattr(other, "alphabet"): 206 if not Alphabet._check_type_compatible([self.alphabet, 207 other.alphabet]): 208 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 209 self.alphabet, other.alphabet), 210 BiopythonWarning) 211 return str(self) < str(other)
212
213 - def __le__(self, other):
214 """Less than or equal, see __eq__ documentation.""" 215 if hasattr(other, "alphabet"): 216 if not Alphabet._check_type_compatible([self.alphabet, 217 other.alphabet]): 218 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 219 self.alphabet, other.alphabet), 220 BiopythonWarning) 221 return str(self) <= str(other)
222
223 - def __len__(self):
224 """Returns the length of the sequence, use len(my_seq).""" 225 return len(self._data) # Seq API requirement
226
227 - def __getitem__(self, index): # Seq API requirement
228 """Returns a subsequence of single letter, use my_seq[index].""" 229 # Note since Python 2.0, __getslice__ is deprecated 230 # and __getitem__ is used instead. 231 # See http://docs.python.org/ref/sequence-methods.html 232 if isinstance(index, int): 233 # Return a single letter as a string 234 return self._data[index] 235 else: 236 # Return the (sub)sequence as another Seq object 237 return Seq(self._data[index], self.alphabet)
238
239 - def __add__(self, other):
240 """Add another sequence or string to this sequence. 241 242 If adding a string to a Seq, the alphabet is preserved: 243 244 >>> from Bio.Seq import Seq 245 >>> from Bio.Alphabet import generic_protein 246 >>> Seq("MELKI", generic_protein) + "LV" 247 Seq('MELKILV', ProteinAlphabet()) 248 249 When adding two Seq (like) objects, the alphabets are important. 250 Consider this example: 251 252 >>> from Bio.Seq import Seq 253 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 254 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 255 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 256 >>> unamb_dna_seq 257 Seq('ACGT', IUPACUnambiguousDNA()) 258 >>> ambig_dna_seq 259 Seq('ACRGT', IUPACAmbiguousDNA()) 260 261 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 262 the more general ambiguous IUPAC DNA alphabet: 263 264 >>> unamb_dna_seq + ambig_dna_seq 265 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 266 267 However, if the default generic alphabet is included, the result is 268 a generic alphabet: 269 270 >>> Seq("") + ambig_dna_seq 271 Seq('ACRGT', Alphabet()) 272 273 You can't add RNA and DNA sequences: 274 275 >>> from Bio.Alphabet import generic_dna, generic_rna 276 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 277 Traceback (most recent call last): 278 ... 279 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 280 281 You can't add nucleotide and protein sequences: 282 283 >>> from Bio.Alphabet import generic_dna, generic_protein 284 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 285 Traceback (most recent call last): 286 ... 287 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 288 """ 289 if hasattr(other, "alphabet"): 290 # other should be a Seq or a MutableSeq 291 if not Alphabet._check_type_compatible([self.alphabet, 292 other.alphabet]): 293 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 294 self.alphabet, other.alphabet)) 295 # They should be the same sequence type (or one of them is generic) 296 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 297 return self.__class__(str(self) + str(other), a) 298 elif isinstance(other, basestring): 299 # other is a plain string - use the current alphabet 300 return self.__class__(str(self) + other, self.alphabet) 301 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 302 if isinstance(other, SeqRecord): 303 # Get the SeqRecord's __radd__ to handle this 304 return NotImplemented 305 else: 306 raise TypeError
307
308 - def __radd__(self, other):
309 """Adding a sequence on the left. 310 311 If adding a string to a Seq, the alphabet is preserved: 312 313 >>> from Bio.Seq import Seq 314 >>> from Bio.Alphabet import generic_protein 315 >>> "LV" + Seq("MELKI", generic_protein) 316 Seq('LVMELKI', ProteinAlphabet()) 317 318 Adding two Seq (like) objects is handled via the __add__ method. 319 """ 320 if hasattr(other, "alphabet"): 321 # other should be a Seq or a MutableSeq 322 if not Alphabet._check_type_compatible([self.alphabet, 323 other.alphabet]): 324 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 325 self.alphabet, other.alphabet)) 326 # They should be the same sequence type (or one of them is generic) 327 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 328 return self.__class__(str(other) + str(self), a) 329 elif isinstance(other, basestring): 330 # other is a plain string - use the current alphabet 331 return self.__class__(other + str(self), self.alphabet) 332 else: 333 raise TypeError
334
335 - def tostring(self): # Seq API requirement
336 """Returns the full sequence as a python string (DEPRECATED). 337 338 You are now encouraged to use str(my_seq) instead of 339 my_seq.tostring().""" 340 from Bio import BiopythonDeprecationWarning 341 warnings.warn("This method is obsolete; please use str(my_seq) " 342 "instead of my_seq.tostring().", 343 BiopythonDeprecationWarning) 344 return str(self) 345
346 - def tomutable(self): # Needed? Or use a function?
347 """Returns the full sequence as a MutableSeq object. 348 349 >>> from Bio.Seq import Seq 350 >>> from Bio.Alphabet import IUPAC 351 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 352 ... IUPAC.protein) 353 >>> my_seq 354 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 355 >>> my_seq.tomutable() 356 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 357 358 Note that the alphabet is preserved. 359 """ 360 return MutableSeq(str(self), self.alphabet) 361
362 - def _get_seq_str_and_check_alphabet(self, other_sequence):
363 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 364 365 For a string argument, returns the string. 366 367 For a Seq or MutableSeq, it checks the alphabet is compatible 368 (raising an exception if it isn't), and then returns a string. 369 """ 370 try: 371 other_alpha = other_sequence.alphabet 372 except AttributeError: 373 # Assume other_sequence is a string 374 return other_sequence 375 376 # Other should be a Seq or a MutableSeq 377 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 378 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 379 self.alphabet, other_alpha)) 380 # Return as a string 381 return str(other_sequence)
382
383 - def count(self, sub, start=0, end=sys.maxsize):
384 """Non-overlapping count method, like that of a python string. 385 386 This behaves like the python string method of the same name, 387 which does a non-overlapping count! 388 389 Returns an integer, the number of occurrences of substring 390 argument sub in the (sub)sequence given by [start:end]. 391 Optional arguments start and end are interpreted as in slice 392 notation. 393 394 Arguments: 395 - sub - a string or another Seq object to look for 396 - start - optional integer, slice start 397 - end - optional integer, slice end 398 399 e.g. 400 401 >>> from Bio.Seq import Seq 402 >>> my_seq = Seq("AAAATGA") 403 >>> print(my_seq.count("A")) 404 5 405 >>> print(my_seq.count("ATG")) 406 1 407 >>> print(my_seq.count(Seq("AT"))) 408 1 409 >>> print(my_seq.count("AT", 2, -1)) 410 1 411 412 HOWEVER, please note because python strings and Seq objects (and 413 MutableSeq objects) do a non-overlapping search, this may not give 414 the answer you expect: 415 416 >>> "AAAA".count("AA") 417 2 418 >>> print(Seq("AAAA").count("AA")) 419 2 420 421 An overlapping search would give the answer as three! 422 """ 423 # If it has one, check the alphabet: 424 sub_str = self._get_seq_str_and_check_alphabet(sub) 425 return str(self).count(sub_str, start, end)
426
427 - def __contains__(self, char):
428 """Implements the 'in' keyword, like a python string. 429 430 e.g. 431 432 >>> from Bio.Seq import Seq 433 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 434 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 435 >>> "AAA" in my_dna 436 True 437 >>> Seq("AAA") in my_dna 438 True 439 >>> Seq("AAA", generic_dna) in my_dna 440 True 441 442 Like other Seq methods, this will raise a type error if another Seq 443 (or Seq like) object with an incompatible alphabet is used: 444 445 >>> Seq("AAA", generic_rna) in my_dna 446 Traceback (most recent call last): 447 ... 448 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 449 >>> Seq("AAA", generic_protein) in my_dna 450 Traceback (most recent call last): 451 ... 452 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 453 """ 454 # If it has one, check the alphabet: 455 sub_str = self._get_seq_str_and_check_alphabet(char) 456 return sub_str in str(self)
457
458 - def find(self, sub, start=0, end=sys.maxsize):
459 """Find method, like that of a python string. 460 461 This behaves like the python string method of the same name. 462 463 Returns an integer, the index of the first occurrence of substring 464 argument sub in the (sub)sequence given by [start:end]. 465 466 Arguments: 467 - sub - a string or another Seq object to look for 468 - start - optional integer, slice start 469 - end - optional integer, slice end 470 471 Returns -1 if the subsequence is NOT found. 472 473 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 474 475 >>> from Bio.Seq import Seq 476 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 477 >>> my_rna.find("AUG") 478 3 479 """ 480 # If it has one, check the alphabet: 481 sub_str = self._get_seq_str_and_check_alphabet(sub) 482 return str(self).find(sub_str, start, end)
483
484 - def rfind(self, sub, start=0, end=sys.maxsize):
485 """Find from right method, like that of a python string. 486 487 This behaves like the python string method of the same name. 488 489 Returns an integer, the index of the last (right most) occurrence of 490 substring argument sub in the (sub)sequence given by [start:end]. 491 492 Arguments: 493 - sub - a string or another Seq object to look for 494 - start - optional integer, slice start 495 - end - optional integer, slice end 496 497 Returns -1 if the subsequence is NOT found. 498 499 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 500 501 >>> from Bio.Seq import Seq 502 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 503 >>> my_rna.rfind("AUG") 504 15 505 """ 506 # If it has one, check the alphabet: 507 sub_str = self._get_seq_str_and_check_alphabet(sub) 508 return str(self).rfind(sub_str, start, end)
509
510 - def startswith(self, prefix, start=0, end=sys.maxsize):
511 """Does the Seq start with the given prefix? Returns True/False. 512 513 This behaves like the python string method of the same name. 514 515 Return True if the sequence starts with the specified prefix 516 (a string or another Seq object), False otherwise. 517 With optional start, test sequence beginning at that position. 518 With optional end, stop comparing sequence at that position. 519 prefix can also be a tuple of strings to try. e.g. 520 521 >>> from Bio.Seq import Seq 522 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 523 >>> my_rna.startswith("GUC") 524 True 525 >>> my_rna.startswith("AUG") 526 False 527 >>> my_rna.startswith("AUG", 3) 528 True 529 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 530 True 531 """ 532 # If it has one, check the alphabet: 533 if isinstance(prefix, tuple): 534 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 535 for p in prefix) 536 return str(self).startswith(prefix_strs, start, end) 537 else: 538 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 539 return str(self).startswith(prefix_str, start, end)
540
541 - def endswith(self, suffix, start=0, end=sys.maxsize):
542 """Does the Seq end with the given suffix? Returns True/False. 543 544 This behaves like the python string method of the same name. 545 546 Return True if the sequence ends with the specified suffix 547 (a string or another Seq object), False otherwise. 548 With optional start, test sequence beginning at that position. 549 With optional end, stop comparing sequence at that position. 550 suffix can also be a tuple of strings to try. e.g. 551 552 >>> from Bio.Seq import Seq 553 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 554 >>> my_rna.endswith("UUG") 555 True 556 >>> my_rna.endswith("AUG") 557 False 558 >>> my_rna.endswith("AUG", 0, 18) 559 True 560 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 561 True 562 """ 563 # If it has one, check the alphabet: 564 if isinstance(suffix, tuple): 565 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 566 for p in suffix) 567 return str(self).endswith(suffix_strs, start, end) 568 else: 569 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 570 return str(self).endswith(suffix_str, start, end)
571
572 - def split(self, sep=None, maxsplit=-1):
573 """Split method, like that of a python string. 574 575 This behaves like the python string method of the same name. 576 577 Return a list of the 'words' in the string (as Seq objects), 578 using sep as the delimiter string. If maxsplit is given, at 579 most maxsplit splits are done. If maxsplit is omitted, all 580 splits are made. 581 582 Following the python string method, sep will by default be any 583 white space (tabs, spaces, newlines) but this is unlikely to 584 apply to biological sequences. 585 586 e.g. 587 588 >>> from Bio.Seq import Seq 589 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 590 >>> my_aa = my_rna.translate() 591 >>> my_aa 592 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 593 >>> my_aa.split("*") 594 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 595 >>> my_aa.split("*", 1) 596 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 597 598 See also the rsplit method: 599 600 >>> my_aa.rsplit("*", 1) 601 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 602 """ 603 # If it has one, check the alphabet: 604 sep_str = self._get_seq_str_and_check_alphabet(sep) 605 # TODO - If the sep is the defined stop symbol, or gap char, 606 # should we adjust the alphabet? 607 return [Seq(part, self.alphabet) 608 for part in str(self).split(sep_str, maxsplit)]
609
610 - def rsplit(self, sep=None, maxsplit=-1):
611 """Right split method, like that of a python string. 612 613 This behaves like the python string method of the same name. 614 615 Return a list of the 'words' in the string (as Seq objects), 616 using sep as the delimiter string. If maxsplit is given, at 617 most maxsplit splits are done COUNTING FROM THE RIGHT. 618 If maxsplit is omitted, all splits are made. 619 620 Following the python string method, sep will by default be any 621 white space (tabs, spaces, newlines) but this is unlikely to 622 apply to biological sequences. 623 624 e.g. print(my_seq.rsplit("*",1)) 625 626 See also the split method. 627 """ 628 # If it has one, check the alphabet: 629 sep_str = self._get_seq_str_and_check_alphabet(sep) 630 return [Seq(part, self.alphabet) 631 for part in str(self).rsplit(sep_str, maxsplit)]
632
633 - def strip(self, chars=None):
634 """Returns a new Seq object with leading and trailing ends stripped. 635 636 This behaves like the python string method of the same name. 637 638 Optional argument chars defines which characters to remove. If 639 omitted or None (default) then as for the python string method, 640 this defaults to removing any white space. 641 642 e.g. print(my_seq.strip("-")) 643 644 See also the lstrip and rstrip methods. 645 """ 646 # If it has one, check the alphabet: 647 strip_str = self._get_seq_str_and_check_alphabet(chars) 648 return Seq(str(self).strip(strip_str), self.alphabet)
649
650 - def lstrip(self, chars=None):
651 """Returns a new Seq object with leading (left) end stripped. 652 653 This behaves like the python string method of the same name. 654 655 Optional argument chars defines which characters to remove. If 656 omitted or None (default) then as for the python string method, 657 this defaults to removing any white space. 658 659 e.g. print(my_seq.lstrip("-")) 660 661 See also the strip and rstrip methods. 662 """ 663 # If it has one, check the alphabet: 664 strip_str = self._get_seq_str_and_check_alphabet(chars) 665 return Seq(str(self).lstrip(strip_str), self.alphabet)
666
667 - def rstrip(self, chars=None):
668 """Returns a new Seq object with trailing (right) end stripped. 669 670 This behaves like the python string method of the same name. 671 672 Optional argument chars defines which characters to remove. If 673 omitted or None (default) then as for the python string method, 674 this defaults to removing any white space. 675 676 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 677 678 >>> from Bio.Alphabet import IUPAC 679 >>> from Bio.Seq import Seq 680 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 681 >>> my_seq 682 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 683 >>> my_seq.rstrip("A") 684 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 685 686 See also the strip and lstrip methods. 687 """ 688 # If it has one, check the alphabet: 689 strip_str = self._get_seq_str_and_check_alphabet(chars) 690 return Seq(str(self).rstrip(strip_str), self.alphabet)
691
692 - def upper(self):
693 """Returns an upper case copy of the sequence. 694 695 >>> from Bio.Alphabet import HasStopCodon, generic_protein 696 >>> from Bio.Seq import Seq 697 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 698 >>> my_seq 699 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 700 >>> my_seq.lower() 701 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 702 >>> my_seq.upper() 703 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 704 705 This will adjust the alphabet if required. See also the lower method. 706 """ 707 return Seq(str(self).upper(), self.alphabet._upper())
708
709 - def lower(self):
710 """Returns a lower case copy of the sequence. 711 712 This will adjust the alphabet if required. Note that the IUPAC alphabets 713 are upper case only, and thus a generic alphabet must be substituted. 714 715 >>> from Bio.Alphabet import Gapped, generic_dna 716 >>> from Bio.Alphabet import IUPAC 717 >>> from Bio.Seq import Seq 718 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 719 >>> my_seq 720 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 721 >>> my_seq.lower() 722 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 723 724 See also the upper method. 725 """ 726 return Seq(str(self).lower(), self.alphabet._lower())
727
728 - def complement(self):
729 """Returns the complement sequence. New Seq object. 730 731 >>> from Bio.Seq import Seq 732 >>> from Bio.Alphabet import IUPAC 733 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 734 >>> my_dna 735 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 736 >>> my_dna.complement() 737 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 738 739 You can of course used mixed case sequences, 740 741 >>> from Bio.Seq import Seq 742 >>> from Bio.Alphabet import generic_dna 743 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 744 >>> my_dna 745 Seq('CCCCCgatA-GD', DNAAlphabet()) 746 >>> my_dna.complement() 747 Seq('GGGGGctaT-CH', DNAAlphabet()) 748 749 Note in the above example, ambiguous character D denotes 750 G, A or T so its complement is H (for C, T or A). 751 752 Trying to complement a protein sequence raises an exception. 753 754 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 755 >>> my_protein.complement() 756 Traceback (most recent call last): 757 ... 758 ValueError: Proteins do not have complements! 759 """ 760 base = Alphabet._get_base_alphabet(self.alphabet) 761 if isinstance(base, Alphabet.ProteinAlphabet): 762 raise ValueError("Proteins do not have complements!") 763 if isinstance(base, Alphabet.DNAAlphabet): 764 ttable = _dna_complement_table 765 elif isinstance(base, Alphabet.RNAAlphabet): 766 ttable = _rna_complement_table 767 elif ('U' in self._data or 'u' in self._data) \ 768 and ('T' in self._data or 't' in self._data): 769 # TODO - Handle this cleanly? 770 raise ValueError("Mixed RNA/DNA found") 771 elif 'U' in self._data or 'u' in self._data: 772 ttable = _rna_complement_table 773 else: 774 ttable = _dna_complement_table 775 # Much faster on really long sequences than the previous loop based one. 776 # thx to Michael Palmer, University of Waterloo 777 return Seq(str(self).translate(ttable), self.alphabet)
778
779 - def reverse_complement(self):
780 """Returns the reverse complement sequence. New Seq object. 781 782 >>> from Bio.Seq import Seq 783 >>> from Bio.Alphabet import IUPAC 784 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 785 >>> my_dna 786 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 787 >>> my_dna.reverse_complement() 788 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 789 790 Note in the above example, since R = G or A, its complement 791 is Y (which denotes C or T). 792 793 You can of course used mixed case sequences, 794 795 >>> from Bio.Seq import Seq 796 >>> from Bio.Alphabet import generic_dna 797 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 798 >>> my_dna 799 Seq('CCCCCgatA-G', DNAAlphabet()) 800 >>> my_dna.reverse_complement() 801 Seq('C-TatcGGGGG', DNAAlphabet()) 802 803 Trying to complement a protein sequence raises an exception: 804 805 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 806 >>> my_protein.reverse_complement() 807 Traceback (most recent call last): 808 ... 809 ValueError: Proteins do not have complements! 810 """ 811 # Use -1 stride/step to reverse the complement 812 return self.complement()[::-1]
813
814 - def transcribe(self):
815 """Returns the RNA sequence from a DNA sequence. New Seq object. 816 817 >>> from Bio.Seq import Seq 818 >>> from Bio.Alphabet import IUPAC 819 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 820 ... IUPAC.unambiguous_dna) 821 >>> coding_dna 822 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 823 >>> coding_dna.transcribe() 824 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 825 826 Trying to transcribe a protein or RNA sequence raises an exception: 827 828 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 829 >>> my_protein.transcribe() 830 Traceback (most recent call last): 831 ... 832 ValueError: Proteins cannot be transcribed! 833 """ 834 base = Alphabet._get_base_alphabet(self.alphabet) 835 if isinstance(base, Alphabet.ProteinAlphabet): 836 raise ValueError("Proteins cannot be transcribed!") 837 if isinstance(base, Alphabet.RNAAlphabet): 838 raise ValueError("RNA cannot be transcribed!") 839 840 if self.alphabet == IUPAC.unambiguous_dna: 841 alphabet = IUPAC.unambiguous_rna 842 elif self.alphabet == IUPAC.ambiguous_dna: 843 alphabet = IUPAC.ambiguous_rna 844 else: 845 alphabet = Alphabet.generic_rna 846 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
847
848 - def back_transcribe(self):
849 """Returns the DNA sequence from an RNA sequence. New Seq object. 850 851 >>> from Bio.Seq import Seq 852 >>> from Bio.Alphabet import IUPAC 853 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 854 ... IUPAC.unambiguous_rna) 855 >>> messenger_rna 856 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 857 >>> messenger_rna.back_transcribe() 858 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 859 860 Trying to back-transcribe a protein or DNA sequence raises an 861 exception: 862 863 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 864 >>> my_protein.back_transcribe() 865 Traceback (most recent call last): 866 ... 867 ValueError: Proteins cannot be back transcribed! 868 """ 869 base = Alphabet._get_base_alphabet(self.alphabet) 870 if isinstance(base, Alphabet.ProteinAlphabet): 871 raise ValueError("Proteins cannot be back transcribed!") 872 if isinstance(base, Alphabet.DNAAlphabet): 873 raise ValueError("DNA cannot be back transcribed!") 874 875 if self.alphabet == IUPAC.unambiguous_rna: 876 alphabet = IUPAC.unambiguous_dna 877 elif self.alphabet == IUPAC.ambiguous_rna: 878 alphabet = IUPAC.ambiguous_dna 879 else: 880 alphabet = Alphabet.generic_dna 881 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
882
883 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 884 cds=False):
885 """Turns a nucleotide sequence into a protein sequence. New Seq object. 886 887 This method will translate DNA or RNA sequences, and those with a 888 nucleotide or generic alphabet. Trying to translate a protein 889 sequence raises an exception. 890 891 Arguments: 892 - table - Which codon table to use? This can be either a name 893 (string), an NCBI identifier (integer), or a CodonTable 894 object (useful for non-standard genetic codes). This 895 defaults to the "Standard" table. 896 - stop_symbol - Single character string, what to use for terminators. 897 This defaults to the asterisk, "*". 898 - to_stop - Boolean, defaults to False meaning do a full translation 899 continuing on past any stop codons (translated as the 900 specified stop_symbol). If True, translation is 901 terminated at the first in frame stop codon (and the 902 stop_symbol is not appended to the returned protein 903 sequence). 904 - cds - Boolean, indicates this is a complete CDS. If True, 905 this checks the sequence starts with a valid alternative start 906 codon (which will be translated as methionine, M), that the 907 sequence length is a multiple of three, and that there is a 908 single in frame stop codon at the end (this will be excluded 909 from the protein sequence, regardless of the to_stop option). 910 If these tests fail, an exception is raised. 911 912 e.g. Using the standard table: 913 914 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 915 >>> coding_dna.translate() 916 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 917 >>> coding_dna.translate(stop_symbol="@") 918 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 919 >>> coding_dna.translate(to_stop=True) 920 Seq('VAIVMGR', ExtendedIUPACProtein()) 921 922 Now using NCBI table 2, where TGA is not a stop codon: 923 924 >>> coding_dna.translate(table=2) 925 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 926 >>> coding_dna.translate(table=2, to_stop=True) 927 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 928 929 In fact, GTG is an alternative start codon under NCBI table 2, meaning 930 this sequence could be a complete CDS: 931 932 >>> coding_dna.translate(table=2, cds=True) 933 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 934 935 It isn't a valid CDS under NCBI table 1, due to both the start codon and 936 also the in frame stop codons: 937 938 >>> coding_dna.translate(table=1, cds=True) 939 Traceback (most recent call last): 940 ... 941 TranslationError: First codon 'GTG' is not a start codon 942 943 If the sequence has no in-frame stop codon, then the to_stop argument 944 has no effect: 945 946 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 947 >>> coding_dna2.translate() 948 Seq('LAIVMGR', ExtendedIUPACProtein()) 949 >>> coding_dna2.translate(to_stop=True) 950 Seq('LAIVMGR', ExtendedIUPACProtein()) 951 952 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 953 or a stop codon. These are translated as "X". Any invalid codon 954 (e.g. "TA?" or "T-A") will throw a TranslationError. 955 956 NOTE - Does NOT support gapped sequences. 957 958 NOTE - This does NOT behave like the python string's translate 959 method. For that use str(my_seq).translate(...) instead. 960 """ 961 if isinstance(table, str) and len(table) == 256: 962 raise ValueError("The Seq object translate method DOES NOT take " 963 "a 256 character string mapping table like " 964 "the python string object's translate method. " 965 "Use str(my_seq).translate(...) instead.") 966 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 967 Alphabet.ProteinAlphabet): 968 raise ValueError("Proteins cannot be translated!") 969 try: 970 table_id = int(table) 971 except ValueError: 972 # Assume its a table name 973 if self.alphabet == IUPAC.unambiguous_dna: 974 # Will use standard IUPAC protein alphabet, no need for X 975 codon_table = CodonTable.unambiguous_dna_by_name[table] 976 elif self.alphabet == IUPAC.unambiguous_rna: 977 # Will use standard IUPAC protein alphabet, no need for X 978 codon_table = CodonTable.unambiguous_rna_by_name[table] 979 else: 980 # This will use the extended IUPAC protein alphabet with X etc. 981 # The same table can be used for RNA or DNA (we use this for 982 # translating strings). 983 codon_table = CodonTable.ambiguous_generic_by_name[table] 984 except (AttributeError, TypeError): 985 # Assume its a CodonTable object 986 if isinstance(table, CodonTable.CodonTable): 987 codon_table = table 988 else: 989 raise ValueError('Bad table argument') 990 else: 991 # Assume its a table ID 992 if self.alphabet == IUPAC.unambiguous_dna: 993 # Will use standard IUPAC protein alphabet, no need for X 994 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 995 elif self.alphabet == IUPAC.unambiguous_rna: 996 # Will use standard IUPAC protein alphabet, no need for X 997 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 998 else: 999 # This will use the extended IUPAC protein alphabet with X etc. 1000 # The same table can be used for RNA or DNA (we use this for 1001 # translating strings). 1002 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 1003 protein = _translate_str(str(self), codon_table, 1004 stop_symbol, to_stop, cds) 1005 if stop_symbol in protein: 1006 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 1007 stop_symbol=stop_symbol) 1008 else: 1009 alphabet = codon_table.protein_alphabet 1010 return Seq(protein, alphabet)
1011
1012 - def ungap(self, gap=None):
1013 """Return a copy of the sequence without the gap character(s). 1014 1015 The gap character can be specified in two ways - either as an explicit 1016 argument, or via the sequence's alphabet. For example: 1017 1018 >>> from Bio.Seq import Seq 1019 >>> from Bio.Alphabet import generic_dna 1020 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1021 >>> my_dna 1022 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1023 >>> my_dna.ungap("-") 1024 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1025 1026 If the gap character is not given as an argument, it will be taken from 1027 the sequence's alphabet (if defined). Notice that the returned sequence's 1028 alphabet is adjusted since it no longer requires a gapped alphabet: 1029 1030 >>> from Bio.Seq import Seq 1031 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1032 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1033 >>> my_pro 1034 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1035 >>> my_pro.ungap() 1036 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1037 1038 Or, with a simpler gapped DNA example: 1039 1040 >>> from Bio.Seq import Seq 1041 >>> from Bio.Alphabet import IUPAC, Gapped 1042 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1043 >>> my_seq 1044 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1045 >>> my_seq.ungap() 1046 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1047 1048 As long as it is consistent with the alphabet, although it is redundant, 1049 you can still supply the gap character as an argument to this method: 1050 1051 >>> my_seq 1052 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1053 >>> my_seq.ungap("=") 1054 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1055 1056 However, if the gap character given as the argument disagrees with that 1057 declared in the alphabet, an exception is raised: 1058 1059 >>> my_seq 1060 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1061 >>> my_seq.ungap("-") 1062 Traceback (most recent call last): 1063 ... 1064 ValueError: Gap '-' does not match '=' from alphabet 1065 1066 Finally, if a gap character is not supplied, and the alphabet does not 1067 define one, an exception is raised: 1068 1069 >>> from Bio.Seq import Seq 1070 >>> from Bio.Alphabet import generic_dna 1071 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1072 >>> my_dna 1073 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1074 >>> my_dna.ungap() 1075 Traceback (most recent call last): 1076 ... 1077 ValueError: Gap character not given and not defined in alphabet 1078 1079 """ 1080 if hasattr(self.alphabet, "gap_char"): 1081 if not gap: 1082 gap = self.alphabet.gap_char 1083 elif gap != self.alphabet.gap_char: 1084 raise ValueError("Gap {0!r} does not match {1!r} from alphabet".format( 1085 gap, self.alphabet.gap_char)) 1086 alpha = Alphabet._ungap(self.alphabet) 1087 elif not gap: 1088 raise ValueError("Gap character not given and not defined in alphabet") 1089 else: 1090 alpha = self.alphabet # modify! 1091 if len(gap) != 1 or not isinstance(gap, str): 1092 raise ValueError("Unexpected gap character, {0!r}".format(gap)) 1093 return Seq(str(self).replace(gap, ""), alpha)
1094 1095
1096 -class UnknownSeq(Seq):
1097 """A read-only sequence object of known length but unknown contents. 1098 1099 If you have an unknown sequence, you can represent this with a normal 1100 Seq object, for example: 1101 1102 >>> my_seq = Seq("N"*5) 1103 >>> my_seq 1104 Seq('NNNNN', Alphabet()) 1105 >>> len(my_seq) 1106 5 1107 >>> print(my_seq) 1108 NNNNN 1109 1110 However, this is rather wasteful of memory (especially for large 1111 sequences), which is where this class is most usefull: 1112 1113 >>> unk_five = UnknownSeq(5) 1114 >>> unk_five 1115 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1116 >>> len(unk_five) 1117 5 1118 >>> print(unk_five) 1119 ????? 1120 1121 You can add unknown sequence together, provided their alphabets and 1122 characters are compatible, and get another memory saving UnknownSeq: 1123 1124 >>> unk_four = UnknownSeq(4) 1125 >>> unk_four 1126 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1127 >>> unk_four + unk_five 1128 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1129 1130 If the alphabet or characters don't match up, the addition gives an 1131 ordinary Seq object: 1132 1133 >>> unk_nnnn = UnknownSeq(4, character = "N") 1134 >>> unk_nnnn 1135 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1136 >>> unk_nnnn + unk_four 1137 Seq('NNNN????', Alphabet()) 1138 1139 Combining with a real Seq gives a new Seq object: 1140 1141 >>> known_seq = Seq("ACGT") 1142 >>> unk_four + known_seq 1143 Seq('????ACGT', Alphabet()) 1144 >>> known_seq + unk_four 1145 Seq('ACGT????', Alphabet()) 1146 """
1147 - def __init__(self, length, alphabet=Alphabet.generic_alphabet, character=None):
1148 """Create a new UnknownSeq object. 1149 1150 If character is omitted, it is determined from the alphabet, "N" for 1151 nucleotides, "X" for proteins, and "?" otherwise. 1152 """ 1153 self._length = int(length) 1154 if self._length < 0: 1155 # TODO - Block zero length UnknownSeq? You can just use a Seq! 1156 raise ValueError("Length must not be negative.") 1157 self.alphabet = alphabet 1158 if character: 1159 if len(character) != 1: 1160 raise ValueError("character argument should be a single letter string.") 1161 self._character = character 1162 else: 1163 base = Alphabet._get_base_alphabet(alphabet) 1164 # TODO? Check the case of the letters in the alphabet? 1165 # We may have to use "n" instead of "N" etc. 1166 if isinstance(base, Alphabet.NucleotideAlphabet): 1167 self._character = "N" 1168 elif isinstance(base, Alphabet.ProteinAlphabet): 1169 self._character = "X" 1170 else: 1171 self._character = "?"
1172
1173 - def __len__(self):
1174 """Returns the stated length of the unknown sequence.""" 1175 return self._length
1176
1177 - def __str__(self):
1178 """Returns the unknown sequence as full string of the given length.""" 1179 return self._character * self._length
1180
1181 - def __repr__(self):
1182 return "UnknownSeq({0}, alphabet = {1!r}, character = {2!r})".format( 1183 self._length, self.alphabet, self._character)
1184
1185 - def __add__(self, other):
1186 """Add another sequence or string to this sequence. 1187 1188 Adding two UnknownSeq objects returns another UnknownSeq object 1189 provided the character is the same and the alphabets are compatible. 1190 1191 >>> from Bio.Seq import UnknownSeq 1192 >>> from Bio.Alphabet import generic_protein 1193 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1194 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1195 1196 If the characters differ, an UnknownSeq object cannot be used, so a 1197 Seq object is returned: 1198 1199 >>> from Bio.Seq import UnknownSeq 1200 >>> from Bio.Alphabet import generic_protein 1201 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1202 ... character="x") 1203 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1204 1205 If adding a string to an UnknownSeq, a new Seq is returned with the 1206 same alphabet: 1207 1208 >>> from Bio.Seq import UnknownSeq 1209 >>> from Bio.Alphabet import generic_protein 1210 >>> UnknownSeq(5, generic_protein) + "LV" 1211 Seq('XXXXXLV', ProteinAlphabet()) 1212 """ 1213 if isinstance(other, UnknownSeq) and other._character == self._character: 1214 # TODO - Check the alphabets match 1215 return UnknownSeq(len(self) + len(other), 1216 self.alphabet, self._character) 1217 # Offload to the base class... 1218 return Seq(str(self), self.alphabet) + other
1219
1220 - def __radd__(self, other):
1221 # If other is an UnknownSeq, then __add__ would be called. 1222 # Offload to the base class... 1223 return other + Seq(str(self), self.alphabet)
1224
1225 - def __getitem__(self, index):
1226 """Get a subsequence from the UnknownSeq object. 1227 1228 >>> unk = UnknownSeq(8, character="N") 1229 >>> print(unk[:]) 1230 NNNNNNNN 1231 >>> print(unk[5:3]) 1232 <BLANKLINE> 1233 >>> print(unk[1:-1]) 1234 NNNNNN 1235 >>> print(unk[1:-1:2]) 1236 NNN 1237 """ 1238 if isinstance(index, int): 1239 # TODO - Check the bounds without wasting memory 1240 return str(self)[index] 1241 old_length = self._length 1242 step = index.step 1243 if step is None or step == 1: 1244 # This calculates the length you'd get from ("N"*old_length)[index] 1245 start = index.start 1246 end = index.stop 1247 if start is None: 1248 start = 0 1249 elif start < 0: 1250 start = max(0, old_length + start) 1251 elif start > old_length: 1252 start = old_length 1253 if end is None: 1254 end = old_length 1255 elif end < 0: 1256 end = max(0, old_length + end) 1257 elif end > old_length: 1258 end = old_length 1259 new_length = max(0, end - start) 1260 elif step == 0: 1261 raise ValueError("slice step cannot be zero") 1262 else: 1263 # TODO - handle step efficiently 1264 new_length = len(("X" * old_length)[index]) 1265 # assert new_length == len(("X"*old_length)[index]), \ 1266 # (index, start, end, step, old_length, 1267 # new_length, len(("X"*old_length)[index])) 1268 return UnknownSeq(new_length, self.alphabet, self._character)
1269
1270 - def count(self, sub, start=0, end=sys.maxsize):
1271 """Non-overlapping count method, like that of a python string. 1272 1273 This behaves like the python string (and Seq object) method of the 1274 same name, which does a non-overlapping count! 1275 1276 Returns an integer, the number of occurrences of substring 1277 argument sub in the (sub)sequence given by [start:end]. 1278 Optional arguments start and end are interpreted as in slice 1279 notation. 1280 1281 Arguments: 1282 - sub - a string or another Seq object to look for 1283 - start - optional integer, slice start 1284 - end - optional integer, slice end 1285 1286 >>> "NNNN".count("N") 1287 4 1288 >>> Seq("NNNN").count("N") 1289 4 1290 >>> UnknownSeq(4, character="N").count("N") 1291 4 1292 >>> UnknownSeq(4, character="N").count("A") 1293 0 1294 >>> UnknownSeq(4, character="N").count("AA") 1295 0 1296 1297 HOWEVER, please note because that python strings and Seq objects (and 1298 MutableSeq objects) do a non-overlapping search, this may not give 1299 the answer you expect: 1300 1301 >>> UnknownSeq(4, character="N").count("NN") 1302 2 1303 >>> UnknownSeq(4, character="N").count("NNN") 1304 1 1305 """ 1306 sub_str = self._get_seq_str_and_check_alphabet(sub) 1307 if len(sub_str) == 1: 1308 if str(sub_str) == self._character: 1309 if start == 0 and end >= self._length: 1310 return self._length 1311 else: 1312 # This could be done more cleverly... 1313 return str(self).count(sub_str, start, end) 1314 else: 1315 return 0 1316 else: 1317 if set(sub_str) == set(self._character): 1318 if start == 0 and end >= self._length: 1319 return self._length // len(sub_str) 1320 else: 1321 # This could be done more cleverly... 1322 return str(self).count(sub_str, start, end) 1323 else: 1324 return 0
1325
1326 - def complement(self):
1327 """The complement of an unknown nucleotide equals itself. 1328 1329 >>> my_nuc = UnknownSeq(8) 1330 >>> my_nuc 1331 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1332 >>> print(my_nuc) 1333 ???????? 1334 >>> my_nuc.complement() 1335 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1336 >>> print(my_nuc.complement()) 1337 ???????? 1338 """ 1339 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1340 Alphabet.ProteinAlphabet): 1341 raise ValueError("Proteins do not have complements!") 1342 return self
1343
1344 - def reverse_complement(self):
1345 """The reverse complement of an unknown nucleotide equals itself. 1346 1347 >>> my_nuc = UnknownSeq(10) 1348 >>> my_nuc 1349 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1350 >>> print(my_nuc) 1351 ?????????? 1352 >>> my_nuc.reverse_complement() 1353 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1354 >>> print(my_nuc.reverse_complement()) 1355 ?????????? 1356 """ 1357 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1358 Alphabet.ProteinAlphabet): 1359 raise ValueError("Proteins do not have complements!") 1360 return self
1361
1362 - def transcribe(self):
1363 """Returns unknown RNA sequence from an unknown DNA sequence. 1364 1365 >>> my_dna = UnknownSeq(10, character="N") 1366 >>> my_dna 1367 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1368 >>> print(my_dna) 1369 NNNNNNNNNN 1370 >>> my_rna = my_dna.transcribe() 1371 >>> my_rna 1372 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1373 >>> print(my_rna) 1374 NNNNNNNNNN 1375 """ 1376 # Offload the alphabet stuff 1377 s = Seq(self._character, self.alphabet).transcribe() 1378 return UnknownSeq(self._length, s.alphabet, self._character)
1379
1380 - def back_transcribe(self):
1381 """Returns unknown DNA sequence from an unknown RNA sequence. 1382 1383 >>> my_rna = UnknownSeq(20, character="N") 1384 >>> my_rna 1385 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1386 >>> print(my_rna) 1387 NNNNNNNNNNNNNNNNNNNN 1388 >>> my_dna = my_rna.back_transcribe() 1389 >>> my_dna 1390 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1391 >>> print(my_dna) 1392 NNNNNNNNNNNNNNNNNNNN 1393 """ 1394 # Offload the alphabet stuff 1395 s = Seq(self._character, self.alphabet).back_transcribe() 1396 return UnknownSeq(self._length, s.alphabet, self._character)
1397
1398 - def upper(self):
1399 """Returns an upper case copy of the sequence. 1400 1401 >>> from Bio.Alphabet import generic_dna 1402 >>> from Bio.Seq import UnknownSeq 1403 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1404 >>> my_seq 1405 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1406 >>> print(my_seq) 1407 nnnnnnnnnnnnnnnnnnnn 1408 >>> my_seq.upper() 1409 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1410 >>> print(my_seq.upper()) 1411 NNNNNNNNNNNNNNNNNNNN 1412 1413 This will adjust the alphabet if required. See also the lower method. 1414 """ 1415 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1416
1417 - def lower(self):
1418 """Returns a lower case copy of the sequence. 1419 1420 This will adjust the alphabet if required: 1421 1422 >>> from Bio.Alphabet import IUPAC 1423 >>> from Bio.Seq import UnknownSeq 1424 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1425 >>> my_seq 1426 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1427 >>> print(my_seq) 1428 XXXXXXXXXXXXXXXXXXXX 1429 >>> my_seq.lower() 1430 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1431 >>> print(my_seq.lower()) 1432 xxxxxxxxxxxxxxxxxxxx 1433 1434 See also the upper method. 1435 """ 1436 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1437
1438 - def translate(self, **kwargs):
1439 """Translate an unknown nucleotide sequence into an unknown protein. 1440 1441 e.g. 1442 1443 >>> my_seq = UnknownSeq(9, character="N") 1444 >>> print(my_seq) 1445 NNNNNNNNN 1446 >>> my_protein = my_seq.translate() 1447 >>> my_protein 1448 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1449 >>> print(my_protein) 1450 XXX 1451 1452 In comparison, using a normal Seq object: 1453 1454 >>> my_seq = Seq("NNNNNNNNN") 1455 >>> print(my_seq) 1456 NNNNNNNNN 1457 >>> my_protein = my_seq.translate() 1458 >>> my_protein 1459 Seq('XXX', ExtendedIUPACProtein()) 1460 >>> print(my_protein) 1461 XXX 1462 1463 """ 1464 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1465 Alphabet.ProteinAlphabet): 1466 raise ValueError("Proteins cannot be translated!") 1467 return UnknownSeq(self._length // 3, Alphabet.generic_protein, "X")
1468
1469 - def ungap(self, gap=None):
1470 """Return a copy of the sequence without the gap character(s). 1471 1472 The gap character can be specified in two ways - either as an explicit 1473 argument, or via the sequence's alphabet. For example: 1474 1475 >>> from Bio.Seq import UnknownSeq 1476 >>> from Bio.Alphabet import Gapped, generic_dna 1477 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1478 >>> my_dna 1479 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1480 >>> my_dna.ungap() 1481 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1482 >>> my_dna.ungap("-") 1483 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1484 1485 If the UnknownSeq is using the gap character, then an empty Seq is 1486 returned: 1487 1488 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1489 >>> my_gap 1490 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1491 >>> my_gap.ungap() 1492 Seq('', DNAAlphabet()) 1493 >>> my_gap.ungap("-") 1494 Seq('', DNAAlphabet()) 1495 1496 Notice that the returned sequence's alphabet is adjusted to remove any 1497 explicit gap character declaration. 1498 """ 1499 # Offload the alphabet stuff 1500 s = Seq(self._character, self.alphabet).ungap() 1501 if s: 1502 return UnknownSeq(self._length, s.alphabet, self._character) 1503 else: 1504 return Seq("", s.alphabet)
1505 1506
1507 -class MutableSeq(object):
1508 """An editable sequence object (with an alphabet). 1509 1510 Unlike normal python strings and our basic sequence object (the Seq class) 1511 which are immutable, the MutableSeq lets you edit the sequence in place. 1512 However, this means you cannot use a MutableSeq object as a dictionary key. 1513 1514 >>> from Bio.Seq import MutableSeq 1515 >>> from Bio.Alphabet import generic_dna 1516 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1517 >>> my_seq 1518 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1519 >>> my_seq[5] 1520 'T' 1521 >>> my_seq[5] = "A" 1522 >>> my_seq 1523 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1524 >>> my_seq[5] 1525 'A' 1526 >>> my_seq[5:8] = "NNN" 1527 >>> my_seq 1528 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1529 >>> len(my_seq) 1530 11 1531 1532 Note that the MutableSeq object does not support as many string-like 1533 or biological methods as the Seq object. 1534 """
1535 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
1536 if sys.version_info[0] == 3: 1537 self.array_indicator = "u" 1538 else: 1539 self.array_indicator = "c" 1540 if isinstance(data, str): # TODO - What about unicode? 1541 self.data = array.array(self.array_indicator, data) 1542 else: 1543 self.data = data # assumes the input is an array 1544 self.alphabet = alphabet
1545
1546 - def __repr__(self):
1547 """Returns a (truncated) representation of the sequence for debugging.""" 1548 if len(self) > 60: 1549 # Shows the last three letters as it is often useful to see if there 1550 # is a stop codon at the end of a sequence. 1551 # Note total length is 54+3+3=60 1552 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 1553 str(self[:54]), 1554 str(self[-3:]), 1555 self.alphabet) 1556 else: 1557 return "{0}('{1}', {2!r})".format(self.__class__.__name__, 1558 str(self), 1559 self.alphabet)
1560
1561 - def __str__(self):
1562 """Returns the full sequence as a python string. 1563 1564 Note that Biopython 1.44 and earlier would give a truncated 1565 version of repr(my_seq) for str(my_seq). If you are writing code 1566 which needs to be backwards compatible with old Biopython, you 1567 should continue to use my_seq.tostring() rather than str(my_seq). 1568 """ 1569 # See test_GAQueens.py for an historic usage of a non-string alphabet! 1570 return "".join(self.data)
1571
1572 - def __eq__(self, other):
1573 """Compare the sequence to another sequence or a string (README). 1574 1575 Currently if compared to another sequence the alphabets must be 1576 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1577 an exception. Otherwise only the sequence itself is compared, not the 1578 precise alphabet. 1579 1580 A future release of Biopython will change this (and the Seq object etc) 1581 to use simple string comparison. The plan is that comparing sequences 1582 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1583 but not an exception. 1584 1585 During this transition period, please just do explicit comparisons: 1586 1587 >>> seq1 = MutableSeq("ACGT") 1588 >>> seq2 = MutableSeq("ACGT") 1589 >>> id(seq1) == id(seq2) 1590 False 1591 >>> str(seq1) == str(seq2) 1592 True 1593 1594 Biopython now does: 1595 1596 >>> seq1 == seq2 1597 True 1598 >>> seq1 == Seq("ACGT") 1599 True 1600 >>> seq1 == "ACGT" 1601 True 1602 1603 """ 1604 if hasattr(other, "alphabet"): 1605 if not Alphabet._check_type_compatible([self.alphabet, 1606 other.alphabet]): 1607 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1608 self.alphabet, other.alphabet), 1609 BiopythonWarning) 1610 if isinstance(other, MutableSeq): 1611 return self.data == other.data 1612 return str(self) == str(other)
1613
1614 - def __ne__(self, other):
1615 """Not equal, see __eq__ documentation.""" 1616 # Seem to require this method under Python 2 but not needed on Python 3? 1617 return not (self == other)
1618
1619 - def __lt__(self, other):
1620 """Less than, see __eq__ documentation.""" 1621 if hasattr(other, "alphabet"): 1622 if not Alphabet._check_type_compatible([self.alphabet, 1623 other.alphabet]): 1624 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1625 self.alphabet, other.alphabet), 1626 BiopythonWarning) 1627 if isinstance(other, MutableSeq): 1628 return self.data < other.data 1629 return str(self) < str(other)
1630
1631 - def __le__(self, other):
1632 """Less than or equal, see __eq__ documentation.""" 1633 if hasattr(other, "alphabet"): 1634 if not Alphabet._check_type_compatible([self.alphabet, 1635 other.alphabet]): 1636 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1637 self.alphabet, other.alphabet), 1638 BiopythonWarning) 1639 if isinstance(other, MutableSeq): 1640 return self.data <= other.data 1641 return str(self) <= str(other)
1642
1643 - def __len__(self):
1644 return len(self.data)
1645
1646 - def __getitem__(self, index):
1647 # Note since Python 2.0, __getslice__ is deprecated 1648 # and __getitem__ is used instead. 1649 # See http://docs.python.org/ref/sequence-methods.html 1650 if isinstance(index, int): 1651 # Return a single letter as a string 1652 return self.data[index] 1653 else: 1654 # Return the (sub)sequence as another Seq object 1655 return MutableSeq(self.data[index], self.alphabet)
1656
1657 - def __setitem__(self, index, value):
1658 # Note since Python 2.0, __setslice__ is deprecated 1659 # and __setitem__ is used instead. 1660 # See http://docs.python.org/ref/sequence-methods.html 1661 if isinstance(index, int): 1662 # Replacing a single letter with a new string 1663 self.data[index] = value 1664 else: 1665 # Replacing a sub-sequence 1666 if isinstance(value, MutableSeq): 1667 self.data[index] = value.data 1668 elif isinstance(value, type(self.data)): 1669 self.data[index] = value 1670 else: 1671 self.data[index] = array.array(self.array_indicator, 1672 str(value))
1673
1674 - def __delitem__(self, index):
1675 # Note since Python 2.0, __delslice__ is deprecated 1676 # and __delitem__ is used instead. 1677 # See http://docs.python.org/ref/sequence-methods.html 1678 1679 # Could be deleting a single letter, or a slice 1680 del self.data[index]
1681
1682 - def __add__(self, other):
1683 """Add another sequence or string to this sequence. 1684 1685 Returns a new MutableSeq object.""" 1686 if hasattr(other, "alphabet"): 1687 # other should be a Seq or a MutableSeq 1688 if not Alphabet._check_type_compatible([self.alphabet, 1689 other.alphabet]): 1690 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 1691 self.alphabet, other.alphabet)) 1692 # They should be the same sequence type (or one of them is generic) 1693 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1694 if isinstance(other, MutableSeq): 1695 # See test_GAQueens.py for an historic usage of a non-string 1696 # alphabet! Adding the arrays should support this. 1697 return self.__class__(self.data + other.data, a) 1698 else: 1699 return self.__class__(str(self) + str(other), a) 1700 elif isinstance(other, basestring): 1701 # other is a plain string - use the current alphabet 1702 return self.__class__(str(self) + str(other), self.alphabet) 1703 else: 1704 raise TypeError
1705
1706 - def __radd__(self, other):
1707 if hasattr(other, "alphabet"): 1708 # other should be a Seq or a MutableSeq 1709 if not Alphabet._check_type_compatible([self.alphabet, 1710 other.alphabet]): 1711 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 1712 self.alphabet, other.alphabet)) 1713 # They should be the same sequence type (or one of them is generic) 1714 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1715 if isinstance(other, MutableSeq): 1716 # See test_GAQueens.py for an historic usage of a non-string 1717 # alphabet! Adding the arrays should support this. 1718 return self.__class__(other.data + self.data, a) 1719 else: 1720 return self.__class__(str(other) + str(self), a) 1721 elif isinstance(other, basestring): 1722 # other is a plain string - use the current alphabet 1723 return self.__class__(str(other) + str(self), self.alphabet) 1724 else: 1725 raise TypeError
1726
1727 - def append(self, c):
1728 self.data.append(c)
1729
1730 - def insert(self, i, c):
1731 self.data.insert(i, c)
1732
1733 - def pop(self, i=(-1)):
1734 c = self.data[i] 1735 del self.data[i] 1736 return c
1737
1738 - def remove(self, item):
1739 for i in range(len(self.data)): 1740 if self.data[i] == item: 1741 del self.data[i] 1742 return 1743 raise ValueError("MutableSeq.remove(x): x not in list")
1744
1745 - def count(self, sub, start=0, end=sys.maxsize):
1746 """Non-overlapping count method, like that of a python string. 1747 1748 This behaves like the python string method of the same name, 1749 which does a non-overlapping count! 1750 1751 Returns an integer, the number of occurrences of substring 1752 argument sub in the (sub)sequence given by [start:end]. 1753 Optional arguments start and end are interpreted as in slice 1754 notation. 1755 1756 Arguments: 1757 - sub - a string or another Seq object to look for 1758 - start - optional integer, slice start 1759 - end - optional integer, slice end 1760 1761 e.g. 1762 1763 >>> from Bio.Seq import MutableSeq 1764 >>> my_mseq = MutableSeq("AAAATGA") 1765 >>> print(my_mseq.count("A")) 1766 5 1767 >>> print(my_mseq.count("ATG")) 1768 1 1769 >>> print(my_mseq.count(Seq("AT"))) 1770 1 1771 >>> print(my_mseq.count("AT", 2, -1)) 1772 1 1773 1774 HOWEVER, please note because that python strings, Seq objects and 1775 MutableSeq objects do a non-overlapping search, this may not give 1776 the answer you expect: 1777 1778 >>> "AAAA".count("AA") 1779 2 1780 >>> print(MutableSeq("AAAA").count("AA")) 1781 2 1782 1783 An overlapping search would give the answer as three! 1784 """ 1785 try: 1786 # TODO - Should we check the alphabet? 1787 search = str(sub) 1788 except AttributeError: 1789 search = sub 1790 1791 if not isinstance(search, basestring): 1792 raise TypeError("expected a string, Seq or MutableSeq") 1793 1794 if len(search) == 1: 1795 # Try and be efficient and work directly from the array. 1796 count = 0 1797 for c in self.data[start:end]: 1798 if c == search: 1799 count += 1 1800 return count 1801 else: 1802 # TODO - Can we do this more efficiently? 1803 return str(self).count(search, start, end)
1804
1805 - def index(self, item):
1806 for i in range(len(self.data)): 1807 if self.data[i] == item: 1808 return i 1809 raise ValueError("MutableSeq.index(x): x not in list")
1810
1811 - def reverse(self):
1812 """Modify the mutable sequence to reverse itself. 1813 1814 No return value. 1815 """ 1816 self.data.reverse()
1817
1818 - def complement(self):
1819 """Modify the mutable sequence to take on its complement. 1820 1821 Trying to complement a protein sequence raises an exception. 1822 1823 No return value. 1824 """ 1825 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1826 Alphabet.ProteinAlphabet): 1827 raise ValueError("Proteins do not have complements!") 1828 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1829 d = ambiguous_dna_complement 1830 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1831 d = ambiguous_rna_complement 1832 elif 'U' in self.data and 'T' in self.data: 1833 # TODO - Handle this cleanly? 1834 raise ValueError("Mixed RNA/DNA found") 1835 elif 'U' in self.data: 1836 d = ambiguous_rna_complement 1837 else: 1838 d = ambiguous_dna_complement 1839 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 1840 d.update(c) 1841 self.data = [d[c] for c in self.data] 1842 self.data = array.array(self.array_indicator, self.data)
1843
1844 - def reverse_complement(self):
1845 """Modify the mutable sequence to take on its reverse complement. 1846 1847 Trying to reverse complement a protein sequence raises an exception. 1848 1849 No return value. 1850 """ 1851 self.complement() 1852 self.data.reverse()
1853 1854 # Sorting a sequence makes no sense. 1855 # def sort(self, *args): self.data.sort(*args) 1856
1857 - def extend(self, other):
1858 if isinstance(other, MutableSeq): 1859 for c in other.data: 1860 self.data.append(c) 1861 else: 1862 for c in other: 1863 self.data.append(c)
1864
1865 - def tostring(self):
1866 """Returns the full sequence as a python string (DEPRECATED). 1867 1868 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 1869 as this method is officially deprecated. 1870 1871 Because str(my_seq) will give you the full sequence as a python string, 1872 there is often no need to make an explicit conversion. For example, 1873 1874 print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 1875 1876 On Biopython 1.44 or older you would have to have done this: 1877 1878 print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 1879 """ 1880 from Bio import BiopythonDeprecationWarning 1881 warnings.warn("This method is obsolete; please use str(my_seq) " 1882 "instead of my_seq.tostring().", 1883 BiopythonDeprecationWarning) 1884 return "".join(self.data)
1885
1886 - def toseq(self):
1887 """Returns the full sequence as a new immutable Seq object. 1888 1889 >>> from Bio.Seq import Seq 1890 >>> from Bio.Alphabet import IUPAC 1891 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1892 ... IUPAC.protein) 1893 >>> my_mseq 1894 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1895 >>> my_mseq.toseq() 1896 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1897 1898 Note that the alphabet is preserved. 1899 """ 1900 return Seq("".join(self.data), self.alphabet)
1901 1902 1903 # The transcribe, backward_transcribe, and translate functions are 1904 # user-friendly versions of the corresponding functions in Bio.Transcribe 1905 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1906
1907 -def transcribe(dna):
1908 """Transcribes a DNA sequence into RNA. 1909 1910 If given a string, returns a new string object. 1911 1912 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1913 1914 Trying to transcribe a protein or RNA sequence raises an exception. 1915 1916 e.g. 1917 1918 >>> transcribe("ACTGN") 1919 'ACUGN' 1920 """ 1921 if isinstance(dna, Seq): 1922 return dna.transcribe() 1923 elif isinstance(dna, MutableSeq): 1924 return dna.toseq().transcribe() 1925 else: 1926 return dna.replace('T', 'U').replace('t', 'u')
1927 1928
1929 -def back_transcribe(rna):
1930 """Back-transcribes an RNA sequence into DNA. 1931 1932 If given a string, returns a new string object. 1933 1934 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1935 1936 Trying to transcribe a protein or DNA sequence raises an exception. 1937 1938 e.g. 1939 1940 >>> back_transcribe("ACUGN") 1941 'ACTGN' 1942 """ 1943 if isinstance(rna, Seq): 1944 return rna.back_transcribe() 1945 elif isinstance(rna, MutableSeq): 1946 return rna.toseq().back_transcribe() 1947 else: 1948 return rna.replace('U', 'T').replace('u', 't')
1949 1950
1951 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1952 cds=False, pos_stop="X"):
1953 """Helper function to translate a nucleotide string (PRIVATE). 1954 1955 Arguments: 1956 - sequence - a string 1957 - table - a CodonTable object (NOT a table name or id number) 1958 - stop_symbol - a single character string, what to use for terminators. 1959 - to_stop - boolean, should translation terminate at the first 1960 in frame stop codon? If there is no in-frame stop codon 1961 then translation continues to the end. 1962 - pos_stop - a single character string for a possible stop codon 1963 (e.g. TAN or NNN) 1964 - cds - Boolean, indicates this is a complete CDS. If True, this 1965 checks the sequence starts with a valid alternative start 1966 codon (which will be translated as methionine, M), that the 1967 sequence length is a multiple of three, and that there is a 1968 single in frame stop codon at the end (this will be excluded 1969 from the protein sequence, regardless of the to_stop option). 1970 If these tests fail, an exception is raised. 1971 1972 Returns a string. 1973 1974 e.g. 1975 1976 >>> from Bio.Data import CodonTable 1977 >>> table = CodonTable.ambiguous_dna_by_id[1] 1978 >>> _translate_str("AAA", table) 1979 'K' 1980 >>> _translate_str("TAR", table) 1981 '*' 1982 >>> _translate_str("TAN", table) 1983 'X' 1984 >>> _translate_str("TAN", table, pos_stop="@") 1985 '@' 1986 >>> _translate_str("TA?", table) 1987 Traceback (most recent call last): 1988 ... 1989 TranslationError: Codon 'TA?' is invalid 1990 1991 In a change to older verions of Biopython, partial codons are now 1992 always regarded as an error (previously only checked if cds=True) 1993 and will trigger a warning (likely to become an exception in a 1994 future release). 1995 1996 If **cds=True**, the start and stop codons are checked, and the start 1997 codon will be translated at methionine. The sequence must be an 1998 while number of codons. 1999 2000 >>> _translate_str("ATGCCCTAG", table, cds=True) 2001 'MP' 2002 >>> _translate_str("AAACCCTAG", table, cds=True) 2003 Traceback (most recent call last): 2004 ... 2005 TranslationError: First codon 'AAA' is not a start codon 2006 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 2007 Traceback (most recent call last): 2008 ... 2009 TranslationError: Extra in frame stop codon found. 2010 """ 2011 sequence = sequence.upper() 2012 amino_acids = [] 2013 forward_table = table.forward_table 2014 stop_codons = table.stop_codons 2015 if table.nucleotide_alphabet.letters is not None: 2016 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 2017 else: 2018 # Assume the worst case, ambiguous DNA or RNA: 2019 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 2020 IUPAC.ambiguous_rna.letters.upper()) 2021 n = len(sequence) 2022 if cds: 2023 if str(sequence[:3]).upper() not in table.start_codons: 2024 raise CodonTable.TranslationError( 2025 "First codon '{0}' is not a start codon".format(sequence[:3])) 2026 if n % 3 != 0: 2027 raise CodonTable.TranslationError( 2028 "Sequence length {0} is not a multiple of three".format(n)) 2029 if str(sequence[-3:]).upper() not in stop_codons: 2030 raise CodonTable.TranslationError( 2031 "Final codon '{0}' is not a stop codon".format(sequence[-3:])) 2032 # Don't translate the stop symbol, and manually translate the M 2033 sequence = sequence[3:-3] 2034 n -= 6 2035 amino_acids = ["M"] 2036 elif n % 3 != 0: 2037 from Bio import BiopythonWarning 2038 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 2039 "Explicitly trim the sequence or add trailing N before " 2040 "translation. This may become an error in future.", 2041 BiopythonWarning) 2042 for i in range(0, n - n % 3, 3): 2043 codon = sequence[i:i + 3] 2044 try: 2045 amino_acids.append(forward_table[codon]) 2046 except (KeyError, CodonTable.TranslationError): 2047 # TODO? Treat "---" as a special case (gapped translation) 2048 if codon in table.stop_codons: 2049 if cds: 2050 raise CodonTable.TranslationError( 2051 "Extra in frame stop codon found.") 2052 if to_stop: 2053 break 2054 amino_acids.append(stop_symbol) 2055 elif valid_letters.issuperset(set(codon)): 2056 # Possible stop codon (e.g. NNN or TAN) 2057 amino_acids.append(pos_stop) 2058 else: 2059 raise CodonTable.TranslationError( 2060 "Codon '{0}' is invalid".format(codon)) 2061 return "".join(amino_acids)
2062 2063
2064 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2065 cds=False):
2066 """Translate a nucleotide sequence into amino acids. 2067 2068 If given a string, returns a new string object. Given a Seq or 2069 MutableSeq, returns a Seq object with a protein alphabet. 2070 2071 Arguments: 2072 - table - Which codon table to use? This can be either a name (string), 2073 an NCBI identifier (integer), or a CodonTable object (useful 2074 for non-standard genetic codes). Defaults to the "Standard" 2075 table. 2076 - stop_symbol - Single character string, what to use for any 2077 terminators, defaults to the asterisk, "*". 2078 - to_stop - Boolean, defaults to False meaning do a full 2079 translation continuing on past any stop codons 2080 (translated as the specified stop_symbol). If 2081 True, translation is terminated at the first in 2082 frame stop codon (and the stop_symbol is not 2083 appended to the returned protein sequence). 2084 - cds - Boolean, indicates this is a complete CDS. If True, this 2085 checks the sequence starts with a valid alternative start 2086 codon (which will be translated as methionine, M), that the 2087 sequence length is a multiple of three, and that there is a 2088 single in frame stop codon at the end (this will be excluded 2089 from the protein sequence, regardless of the to_stop option). 2090 If these tests fail, an exception is raised. 2091 2092 A simple string example using the default (standard) genetic code: 2093 2094 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2095 >>> translate(coding_dna) 2096 'VAIVMGR*KGAR*' 2097 >>> translate(coding_dna, stop_symbol="@") 2098 'VAIVMGR@KGAR@' 2099 >>> translate(coding_dna, to_stop=True) 2100 'VAIVMGR' 2101 2102 Now using NCBI table 2, where TGA is not a stop codon: 2103 2104 >>> translate(coding_dna, table=2) 2105 'VAIVMGRWKGAR*' 2106 >>> translate(coding_dna, table=2, to_stop=True) 2107 'VAIVMGRWKGAR' 2108 2109 In fact this example uses an alternative start codon valid under NCBI table 2, 2110 GTG, which means this example is a complete valid CDS which when translated 2111 should really start with methionine (not valine): 2112 2113 >>> translate(coding_dna, table=2, cds=True) 2114 'MAIVMGRWKGAR' 2115 2116 Note that if the sequence has no in-frame stop codon, then the to_stop 2117 argument has no effect: 2118 2119 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2120 >>> translate(coding_dna2) 2121 'VAIVMGR' 2122 >>> translate(coding_dna2, to_stop=True) 2123 'VAIVMGR' 2124 2125 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2126 or a stop codon. These are translated as "X". Any invalid codon 2127 (e.g. "TA?" or "T-A") will throw a TranslationError. 2128 2129 NOTE - Does NOT support gapped sequences. 2130 2131 It will however translate either DNA or RNA. 2132 """ 2133 if isinstance(sequence, Seq): 2134 return sequence.translate(table, stop_symbol, to_stop, cds) 2135 elif isinstance(sequence, MutableSeq): 2136 # Return a Seq object 2137 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2138 else: 2139 # Assume its a string, return a string 2140 try: 2141 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2142 except ValueError: 2143 codon_table = CodonTable.ambiguous_generic_by_name[table] 2144 except (AttributeError, TypeError): 2145 if isinstance(table, CodonTable.CodonTable): 2146 codon_table = table 2147 else: 2148 raise ValueError('Bad table argument') 2149 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2150 2151
2152 -def reverse_complement(sequence):
2153 """Returns the reverse complement sequence of a nucleotide string. 2154 2155 If given a string, returns a new string object. 2156 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2157 2158 Supports unambiguous and ambiguous nucleotide sequences. 2159 2160 e.g. 2161 2162 >>> reverse_complement("ACTG-NH") 2163 'DN-CAGT' 2164 """ 2165 if isinstance(sequence, Seq): 2166 # Return a Seq 2167 return sequence.reverse_complement() 2168 elif isinstance(sequence, MutableSeq): 2169 # Return a Seq 2170 # Don't use the MutableSeq reverse_complement method as it is 'in place'. 2171 return sequence.toseq().reverse_complement() 2172 2173 # Assume its a string. 2174 # In order to avoid some code duplication, the old code would turn the string 2175 # into a Seq, use the reverse_complement method, and convert back to a string. 2176 # This worked, but is over five times slower on short sequences! 2177 if ('U' in sequence or 'u' in sequence) \ 2178 and ('T' in sequence or 't' in sequence): 2179 raise ValueError("Mixed RNA/DNA found") 2180 elif 'U' in sequence or 'u' in sequence: 2181 ttable = _rna_complement_table 2182 else: 2183 ttable = _dna_complement_table 2184 return sequence.translate(ttable)[::-1]
2185 2186
2187 -def _test():
2188 """Run the Bio.Seq module's doctests (PRIVATE).""" 2189 if sys.version_info[0:2] == (3, 1): 2190 print("Not running Bio.Seq doctest on Python 3.1") 2191 print("See http://bugs.python.org/issue7490") 2192 else: 2193 print("Running doctests...") 2194 import doctest 2195 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2196 print("Done")
2197 2198 if __name__ == "__main__": 2199 _test() 2200