Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2014 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also the Seq_ wiki and the chapter in our tutorial: 
  11      - `HTML Tutorial`_ 
  12      - `PDF Tutorial`_ 
  13   
  14  .. _Seq: http://biopython.org/wiki/Seq 
  15  .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html 
  16  .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf 
  17  """ 
  18  from __future__ import print_function 
  19   
  20  __docformat__ = "restructuredtext en"  # Don't just use plain text in epydoc API pages! 
  21   
  22  import string  # for maketrans only 
  23  import array 
  24  import sys 
  25  import warnings 
  26   
  27  from Bio._py3k import range 
  28  from Bio._py3k import basestring 
  29   
  30  from Bio import BiopythonWarning 
  31  from Bio import Alphabet 
  32  from Bio.Alphabet import IUPAC 
  33  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  34  from Bio.Data import CodonTable 
  35   
  36   
37 -def _maketrans(complement_mapping):
38 """Makes a python string translation table (PRIVATE). 39 40 Arguments: 41 - complement_mapping - a dictionary such as ambiguous_dna_complement 42 and ambiguous_rna_complement from Data.IUPACData. 43 44 Returns a translation table (a string of length 256) for use with the 45 python string's translate method to use in a (reverse) complement. 46 47 Compatible with lower case and upper case sequences. 48 49 For internal use only. 50 """ 51 before = ''.join(complement_mapping.keys()) 52 after = ''.join(complement_mapping.values()) 53 before = before + before.lower() 54 after = after + after.lower() 55 if sys.version_info[0] == 3: 56 return str.maketrans(before, after) 57 else: 58 return string.maketrans(before, after)
59 60 _dna_complement_table = _maketrans(ambiguous_dna_complement) 61 _rna_complement_table = _maketrans(ambiguous_rna_complement) 62 63
64 -class Seq(object):
65 """A read-only sequence object (essentially a string with an alphabet). 66 67 Like normal python strings, our basic sequence object is immutable. 68 This prevents you from doing my_seq[5] = "A" for example, but does allow 69 Seq objects to be used as dictionary keys. 70 71 The Seq object provides a number of string like methods (such as count, 72 find, split and strip), which are alphabet aware where appropriate. 73 74 In addition to the string like sequence, the Seq object has an alphabet 75 property. This is an instance of an Alphabet class from Bio.Alphabet, 76 for example generic DNA, or IUPAC DNA. This describes the type of molecule 77 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 78 (letters). 79 80 The Seq object also provides some biological methods, such as complement, 81 reverse_complement, transcribe, back_transcribe and translate (which are 82 not applicable to sequences with a protein alphabet). 83 """
84 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
85 """Create a Seq object. 86 87 Arguments: 88 - seq - Sequence, required (string) 89 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 90 91 You will typically use Bio.SeqIO to read in sequences from files as 92 SeqRecord objects, whose sequence will be exposed as a Seq object via 93 the seq property. 94 95 However, will often want to create your own Seq objects directly: 96 97 >>> from Bio.Seq import Seq 98 >>> from Bio.Alphabet import IUPAC 99 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 100 ... IUPAC.protein) 101 >>> my_seq 102 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 103 >>> print(my_seq) 104 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 105 >>> my_seq.alphabet 106 IUPACProtein() 107 """ 108 # Enforce string storage 109 if not isinstance(data, basestring): 110 raise TypeError("The sequence data given to a Seq object should " 111 "be a string (not another Seq object etc)") 112 self._data = data 113 self.alphabet = alphabet # Seq API requirement
114
115 - def __repr__(self):
116 """Returns a (truncated) representation of the sequence for debugging.""" 117 if len(self) > 60: 118 # Shows the last three letters as it is often useful to see if there 119 # is a stop codon at the end of a sequence. 120 # Note total length is 54+3+3=60 121 return "%s('%s...%s', %s)" % (self.__class__.__name__, 122 str(self)[:54], str(self)[-3:], 123 repr(self.alphabet)) 124 else: 125 return "%s(%s, %s)" % (self.__class__.__name__, 126 repr(self._data), 127 repr(self.alphabet))
128
129 - def __str__(self):
130 """Returns the full sequence as a python string, use str(my_seq). 131 132 Note that Biopython 1.44 and earlier would give a truncated 133 version of repr(my_seq) for str(my_seq). If you are writing code 134 which need to be backwards compatible with old Biopython, you 135 should continue to use my_seq.tostring() rather than str(my_seq). 136 """ 137 return self._data
138
139 - def __hash__(self):
140 """Hash for comparison. 141 142 See the __cmp__ documentation - this has changed from past 143 versions of Biopython! 144 """ 145 # TODO - remove this warning in a future release 146 warnings.warn("Biopython Seq objects now use string comparison. " 147 "Older versions of Biopython used object comparison. " 148 "During this transition, please use hash(id(my_seq)) " 149 "or my_dict[id(my_seq)] if you want the old behaviour, " 150 "or use hash(str(my_seq)) or my_dict[str(my_seq)] for " 151 "the new string hashing behaviour.", BiopythonWarning) 152 return hash(str(self))
153
154 - def __eq__(self, other):
155 """Compare the sequence to another sequence or a string (README). 156 157 Historically comparing Seq objects has done Python object comparison. 158 After considerable discussion (keeping in mind constraints of the 159 Python language, hashes and dictionary support), Biopython now uses 160 simple string comparison (with a warning about the change). 161 162 Note that incompatible alphabets (e.g. DNA to RNA) will trigger a 163 warning. 164 165 During this transition period, please just do explicit comparisons: 166 167 >>> from Bio.Seq import Seq 168 >>> from Bio.Alphabet import generic_dna 169 >>> seq1 = Seq("ACGT") 170 >>> seq2 = Seq("ACGT") 171 >>> id(seq1) == id(seq2) 172 False 173 >>> str(seq1) == str(seq2) 174 True 175 176 The new behaviour is to use string-like equality: 177 178 >>> from Bio.Seq import Seq 179 >>> from Bio.Alphabet import generic_dna 180 >>> seq1 == seq2 181 True 182 >>> seq1 == "ACGT" 183 True 184 >>> seq1 == Seq("ACGT", generic_dna) 185 True 186 187 """ 188 if hasattr(other, "alphabet"): 189 #other could be a Seq or a MutableSeq 190 if not Alphabet._check_type_compatible([self.alphabet, 191 other.alphabet]): 192 warnings.warn("Incompatible alphabets %s and %s" 193 % (repr(self.alphabet), repr(other.alphabet)), 194 BiopythonWarning) 195 return str(self) == str(other)
196
197 - def __ne__(self, other):
198 """Not equal, see __eq__ documentation.""" 199 # Seem to require this method under Python 2 but not needed on Python 3? 200 return not (self == other)
201
202 - def __lt__(self, other):
203 """Less than, see __eq__ documentation.""" 204 if hasattr(other, "alphabet"): 205 if not Alphabet._check_type_compatible([self.alphabet, 206 other.alphabet]): 207 warnings.warn("Incompatible alphabets %s and %s" 208 % (repr(self.alphabet), repr(other.alphabet)), 209 BiopythonWarning) 210 return str(self) < str(other)
211
212 - def __le__(self, other):
213 """Less than or equal, see __eq__ documentation.""" 214 if hasattr(other, "alphabet"): 215 if not Alphabet._check_type_compatible([self.alphabet, 216 other.alphabet]): 217 warnings.warn("Incompatible alphabets %s and %s" 218 % (repr(self.alphabet), repr(other.alphabet)), 219 BiopythonWarning) 220 return str(self) <= str(other)
221
222 - def __len__(self):
223 """Returns the length of the sequence, use len(my_seq).""" 224 return len(self._data) # Seq API requirement
225
226 - def __getitem__(self, index): # Seq API requirement
227 """Returns a subsequence of single letter, use my_seq[index].""" 228 # Note since Python 2.0, __getslice__ is deprecated 229 # and __getitem__ is used instead. 230 # See http://docs.python.org/ref/sequence-methods.html 231 if isinstance(index, int): 232 # Return a single letter as a string 233 return self._data[index] 234 else: 235 # Return the (sub)sequence as another Seq object 236 return Seq(self._data[index], self.alphabet)
237
238 - def __add__(self, other):
239 """Add another sequence or string to this sequence. 240 241 If adding a string to a Seq, the alphabet is preserved: 242 243 >>> from Bio.Seq import Seq 244 >>> from Bio.Alphabet import generic_protein 245 >>> Seq("MELKI", generic_protein) + "LV" 246 Seq('MELKILV', ProteinAlphabet()) 247 248 When adding two Seq (like) objects, the alphabets are important. 249 Consider this example: 250 251 >>> from Bio.Seq import Seq 252 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 253 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 254 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 255 >>> unamb_dna_seq 256 Seq('ACGT', IUPACUnambiguousDNA()) 257 >>> ambig_dna_seq 258 Seq('ACRGT', IUPACAmbiguousDNA()) 259 260 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 261 the more general ambiguous IUPAC DNA alphabet: 262 263 >>> unamb_dna_seq + ambig_dna_seq 264 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 265 266 However, if the default generic alphabet is included, the result is 267 a generic alphabet: 268 269 >>> Seq("") + ambig_dna_seq 270 Seq('ACRGT', Alphabet()) 271 272 You can't add RNA and DNA sequences: 273 274 >>> from Bio.Alphabet import generic_dna, generic_rna 275 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 276 Traceback (most recent call last): 277 ... 278 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 279 280 You can't add nucleotide and protein sequences: 281 282 >>> from Bio.Alphabet import generic_dna, generic_protein 283 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 284 Traceback (most recent call last): 285 ... 286 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 287 """ 288 if hasattr(other, "alphabet"): 289 # other should be a Seq or a MutableSeq 290 if not Alphabet._check_type_compatible([self.alphabet, 291 other.alphabet]): 292 raise TypeError("Incompatible alphabets %s and %s" 293 % (repr(self.alphabet), repr(other.alphabet))) 294 # They should be the same sequence type (or one of them is generic) 295 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 296 return self.__class__(str(self) + str(other), a) 297 elif isinstance(other, basestring): 298 # other is a plain string - use the current alphabet 299 return self.__class__(str(self) + other, self.alphabet) 300 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 301 if isinstance(other, SeqRecord): 302 # Get the SeqRecord's __radd__ to handle this 303 return NotImplemented 304 else: 305 raise TypeError
306
307 - def __radd__(self, other):
308 """Adding a sequence on the left. 309 310 If adding a string to a Seq, the alphabet is preserved: 311 312 >>> from Bio.Seq import Seq 313 >>> from Bio.Alphabet import generic_protein 314 >>> "LV" + Seq("MELKI", generic_protein) 315 Seq('LVMELKI', ProteinAlphabet()) 316 317 Adding two Seq (like) objects is handled via the __add__ method. 318 """ 319 if hasattr(other, "alphabet"): 320 # other should be a Seq or a MutableSeq 321 if not Alphabet._check_type_compatible([self.alphabet, 322 other.alphabet]): 323 raise TypeError("Incompatible alphabets %s and %s" 324 % (repr(self.alphabet), repr(other.alphabet))) 325 # They should be the same sequence type (or one of them is generic) 326 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 327 return self.__class__(str(other) + str(self), a) 328 elif isinstance(other, basestring): 329 # other is a plain string - use the current alphabet 330 return self.__class__(other + str(self), self.alphabet) 331 else: 332 raise TypeError
333
334 - def tostring(self): # Seq API requirement
335 """Returns the full sequence as a python string (DEPRECATED). 336 337 You are now encouraged to use str(my_seq) instead of 338 my_seq.tostring().""" 339 from Bio import BiopythonDeprecationWarning 340 warnings.warn("This method is obsolete; please use str(my_seq) " 341 "instead of my_seq.tostring().", 342 BiopythonDeprecationWarning) 343 return str(self) 344
345 - def tomutable(self): # Needed? Or use a function?
346 """Returns the full sequence as a MutableSeq object. 347 348 >>> from Bio.Seq import Seq 349 >>> from Bio.Alphabet import IUPAC 350 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 351 ... IUPAC.protein) 352 >>> my_seq 353 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 354 >>> my_seq.tomutable() 355 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 356 357 Note that the alphabet is preserved. 358 """ 359 return MutableSeq(str(self), self.alphabet) 360
361 - def _get_seq_str_and_check_alphabet(self, other_sequence):
362 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 363 364 For a string argument, returns the string. 365 366 For a Seq or MutableSeq, it checks the alphabet is compatible 367 (raising an exception if it isn't), and then returns a string. 368 """ 369 try: 370 other_alpha = other_sequence.alphabet 371 except AttributeError: 372 # Assume other_sequence is a string 373 return other_sequence 374 375 # Other should be a Seq or a MutableSeq 376 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 377 raise TypeError("Incompatible alphabets %s and %s" 378 % (repr(self.alphabet), repr(other_alpha))) 379 # Return as a string 380 return str(other_sequence)
381
382 - def count(self, sub, start=0, end=sys.maxsize):
383 """Non-overlapping count method, like that of a python string. 384 385 This behaves like the python string method of the same name, 386 which does a non-overlapping count! 387 388 Returns an integer, the number of occurrences of substring 389 argument sub in the (sub)sequence given by [start:end]. 390 Optional arguments start and end are interpreted as in slice 391 notation. 392 393 Arguments: 394 - sub - a string or another Seq object to look for 395 - start - optional integer, slice start 396 - end - optional integer, slice end 397 398 e.g. 399 400 >>> from Bio.Seq import Seq 401 >>> my_seq = Seq("AAAATGA") 402 >>> print(my_seq.count("A")) 403 5 404 >>> print(my_seq.count("ATG")) 405 1 406 >>> print(my_seq.count(Seq("AT"))) 407 1 408 >>> print(my_seq.count("AT", 2, -1)) 409 1 410 411 HOWEVER, please note because python strings and Seq objects (and 412 MutableSeq objects) do a non-overlapping search, this may not give 413 the answer you expect: 414 415 >>> "AAAA".count("AA") 416 2 417 >>> print(Seq("AAAA").count("AA")) 418 2 419 420 An overlapping search would give the answer as three! 421 """ 422 # If it has one, check the alphabet: 423 sub_str = self._get_seq_str_and_check_alphabet(sub) 424 return str(self).count(sub_str, start, end)
425
426 - def __contains__(self, char):
427 """Implements the 'in' keyword, like a python string. 428 429 e.g. 430 431 >>> from Bio.Seq import Seq 432 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 433 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 434 >>> "AAA" in my_dna 435 True 436 >>> Seq("AAA") in my_dna 437 True 438 >>> Seq("AAA", generic_dna) in my_dna 439 True 440 441 Like other Seq methods, this will raise a type error if another Seq 442 (or Seq like) object with an incompatible alphabet is used: 443 444 >>> Seq("AAA", generic_rna) in my_dna 445 Traceback (most recent call last): 446 ... 447 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 448 >>> Seq("AAA", generic_protein) in my_dna 449 Traceback (most recent call last): 450 ... 451 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 452 """ 453 # If it has one, check the alphabet: 454 sub_str = self._get_seq_str_and_check_alphabet(char) 455 return sub_str in str(self)
456
457 - def find(self, sub, start=0, end=sys.maxsize):
458 """Find method, like that of a python string. 459 460 This behaves like the python string method of the same name. 461 462 Returns an integer, the index of the first occurrence of substring 463 argument sub in the (sub)sequence given by [start:end]. 464 465 Arguments: 466 - sub - a string or another Seq object to look for 467 - start - optional integer, slice start 468 - end - optional integer, slice end 469 470 Returns -1 if the subsequence is NOT found. 471 472 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 473 474 >>> from Bio.Seq import Seq 475 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 476 >>> my_rna.find("AUG") 477 3 478 """ 479 # If it has one, check the alphabet: 480 sub_str = self._get_seq_str_and_check_alphabet(sub) 481 return str(self).find(sub_str, start, end)
482
483 - def rfind(self, sub, start=0, end=sys.maxsize):
484 """Find from right method, like that of a python string. 485 486 This behaves like the python string method of the same name. 487 488 Returns an integer, the index of the last (right most) occurrence of 489 substring argument sub in the (sub)sequence given by [start:end]. 490 491 Arguments: 492 - sub - a string or another Seq object to look for 493 - start - optional integer, slice start 494 - end - optional integer, slice end 495 496 Returns -1 if the subsequence is NOT found. 497 498 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 499 500 >>> from Bio.Seq import Seq 501 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 502 >>> my_rna.rfind("AUG") 503 15 504 """ 505 # If it has one, check the alphabet: 506 sub_str = self._get_seq_str_and_check_alphabet(sub) 507 return str(self).rfind(sub_str, start, end)
508
509 - def startswith(self, prefix, start=0, end=sys.maxsize):
510 """Does the Seq start with the given prefix? Returns True/False. 511 512 This behaves like the python string method of the same name. 513 514 Return True if the sequence starts with the specified prefix 515 (a string or another Seq object), False otherwise. 516 With optional start, test sequence beginning at that position. 517 With optional end, stop comparing sequence at that position. 518 prefix can also be a tuple of strings to try. e.g. 519 520 >>> from Bio.Seq import Seq 521 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 522 >>> my_rna.startswith("GUC") 523 True 524 >>> my_rna.startswith("AUG") 525 False 526 >>> my_rna.startswith("AUG", 3) 527 True 528 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 529 True 530 """ 531 # If it has one, check the alphabet: 532 if isinstance(prefix, tuple): 533 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 534 for p in prefix) 535 return str(self).startswith(prefix_strs, start, end) 536 else: 537 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 538 return str(self).startswith(prefix_str, start, end)
539
540 - def endswith(self, suffix, start=0, end=sys.maxsize):
541 """Does the Seq end with the given suffix? Returns True/False. 542 543 This behaves like the python string method of the same name. 544 545 Return True if the sequence ends with the specified suffix 546 (a string or another Seq object), False otherwise. 547 With optional start, test sequence beginning at that position. 548 With optional end, stop comparing sequence at that position. 549 suffix can also be a tuple of strings to try. e.g. 550 551 >>> from Bio.Seq import Seq 552 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 553 >>> my_rna.endswith("UUG") 554 True 555 >>> my_rna.endswith("AUG") 556 False 557 >>> my_rna.endswith("AUG", 0, 18) 558 True 559 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 560 True 561 """ 562 # If it has one, check the alphabet: 563 if isinstance(suffix, tuple): 564 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 565 for p in suffix) 566 return str(self).endswith(suffix_strs, start, end) 567 else: 568 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 569 return str(self).endswith(suffix_str, start, end)
570
571 - def split(self, sep=None, maxsplit=-1):
572 """Split method, like that of a python string. 573 574 This behaves like the python string method of the same name. 575 576 Return a list of the 'words' in the string (as Seq objects), 577 using sep as the delimiter string. If maxsplit is given, at 578 most maxsplit splits are done. If maxsplit is omitted, all 579 splits are made. 580 581 Following the python string method, sep will by default be any 582 white space (tabs, spaces, newlines) but this is unlikely to 583 apply to biological sequences. 584 585 e.g. 586 587 >>> from Bio.Seq import Seq 588 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 589 >>> my_aa = my_rna.translate() 590 >>> my_aa 591 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 592 >>> my_aa.split("*") 593 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 594 >>> my_aa.split("*", 1) 595 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 596 597 See also the rsplit method: 598 599 >>> my_aa.rsplit("*", 1) 600 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 601 """ 602 # If it has one, check the alphabet: 603 sep_str = self._get_seq_str_and_check_alphabet(sep) 604 # TODO - If the sep is the defined stop symbol, or gap char, 605 # should we adjust the alphabet? 606 return [Seq(part, self.alphabet) 607 for part in str(self).split(sep_str, maxsplit)]
608
609 - def rsplit(self, sep=None, maxsplit=-1):
610 """Right split method, like that of a python string. 611 612 This behaves like the python string method of the same name. 613 614 Return a list of the 'words' in the string (as Seq objects), 615 using sep as the delimiter string. If maxsplit is given, at 616 most maxsplit splits are done COUNTING FROM THE RIGHT. 617 If maxsplit is omitted, all splits are made. 618 619 Following the python string method, sep will by default be any 620 white space (tabs, spaces, newlines) but this is unlikely to 621 apply to biological sequences. 622 623 e.g. print(my_seq.rsplit("*",1)) 624 625 See also the split method. 626 """ 627 # If it has one, check the alphabet: 628 sep_str = self._get_seq_str_and_check_alphabet(sep) 629 return [Seq(part, self.alphabet) 630 for part in str(self).rsplit(sep_str, maxsplit)]
631
632 - def strip(self, chars=None):
633 """Returns a new Seq object with leading and trailing ends stripped. 634 635 This behaves like the python string method of the same name. 636 637 Optional argument chars defines which characters to remove. If 638 omitted or None (default) then as for the python string method, 639 this defaults to removing any white space. 640 641 e.g. print(my_seq.strip("-")) 642 643 See also the lstrip and rstrip methods. 644 """ 645 # If it has one, check the alphabet: 646 strip_str = self._get_seq_str_and_check_alphabet(chars) 647 return Seq(str(self).strip(strip_str), self.alphabet)
648
649 - def lstrip(self, chars=None):
650 """Returns a new Seq object with leading (left) end stripped. 651 652 This behaves like the python string method of the same name. 653 654 Optional argument chars defines which characters to remove. If 655 omitted or None (default) then as for the python string method, 656 this defaults to removing any white space. 657 658 e.g. print(my_seq.lstrip("-")) 659 660 See also the strip and rstrip methods. 661 """ 662 # If it has one, check the alphabet: 663 strip_str = self._get_seq_str_and_check_alphabet(chars) 664 return Seq(str(self).lstrip(strip_str), self.alphabet)
665
666 - def rstrip(self, chars=None):
667 """Returns a new Seq object with trailing (right) end stripped. 668 669 This behaves like the python string method of the same name. 670 671 Optional argument chars defines which characters to remove. If 672 omitted or None (default) then as for the python string method, 673 this defaults to removing any white space. 674 675 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 676 677 >>> from Bio.Alphabet import IUPAC 678 >>> from Bio.Seq import Seq 679 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 680 >>> my_seq 681 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 682 >>> my_seq.rstrip("A") 683 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 684 685 See also the strip and lstrip methods. 686 """ 687 # If it has one, check the alphabet: 688 strip_str = self._get_seq_str_and_check_alphabet(chars) 689 return Seq(str(self).rstrip(strip_str), self.alphabet)
690
691 - def upper(self):
692 """Returns an upper case copy of the sequence. 693 694 >>> from Bio.Alphabet import HasStopCodon, generic_protein 695 >>> from Bio.Seq import Seq 696 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 697 >>> my_seq 698 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 699 >>> my_seq.lower() 700 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 701 >>> my_seq.upper() 702 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 703 704 This will adjust the alphabet if required. See also the lower method. 705 """ 706 return Seq(str(self).upper(), self.alphabet._upper())
707
708 - def lower(self):
709 """Returns a lower case copy of the sequence. 710 711 This will adjust the alphabet if required. Note that the IUPAC alphabets 712 are upper case only, and thus a generic alphabet must be substituted. 713 714 >>> from Bio.Alphabet import Gapped, generic_dna 715 >>> from Bio.Alphabet import IUPAC 716 >>> from Bio.Seq import Seq 717 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 718 >>> my_seq 719 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 720 >>> my_seq.lower() 721 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 722 723 See also the upper method. 724 """ 725 return Seq(str(self).lower(), self.alphabet._lower())
726
727 - def complement(self):
728 """Returns the complement sequence. New Seq object. 729 730 >>> from Bio.Seq import Seq 731 >>> from Bio.Alphabet import IUPAC 732 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 733 >>> my_dna 734 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 735 >>> my_dna.complement() 736 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 737 738 You can of course used mixed case sequences, 739 740 >>> from Bio.Seq import Seq 741 >>> from Bio.Alphabet import generic_dna 742 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 743 >>> my_dna 744 Seq('CCCCCgatA-GD', DNAAlphabet()) 745 >>> my_dna.complement() 746 Seq('GGGGGctaT-CH', DNAAlphabet()) 747 748 Note in the above example, ambiguous character D denotes 749 G, A or T so its complement is H (for C, T or A). 750 751 Trying to complement a protein sequence raises an exception. 752 753 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 754 >>> my_protein.complement() 755 Traceback (most recent call last): 756 ... 757 ValueError: Proteins do not have complements! 758 """ 759 base = Alphabet._get_base_alphabet(self.alphabet) 760 if isinstance(base, Alphabet.ProteinAlphabet): 761 raise ValueError("Proteins do not have complements!") 762 if isinstance(base, Alphabet.DNAAlphabet): 763 ttable = _dna_complement_table 764 elif isinstance(base, Alphabet.RNAAlphabet): 765 ttable = _rna_complement_table 766 elif ('U' in self._data or 'u' in self._data) \ 767 and ('T' in self._data or 't' in self._data): 768 # TODO - Handle this cleanly? 769 raise ValueError("Mixed RNA/DNA found") 770 elif 'U' in self._data or 'u' in self._data: 771 ttable = _rna_complement_table 772 else: 773 ttable = _dna_complement_table 774 # Much faster on really long sequences than the previous loop based one. 775 # thx to Michael Palmer, University of Waterloo 776 return Seq(str(self).translate(ttable), self.alphabet)
777
778 - def reverse_complement(self):
779 """Returns the reverse complement sequence. New Seq object. 780 781 >>> from Bio.Seq import Seq 782 >>> from Bio.Alphabet import IUPAC 783 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 784 >>> my_dna 785 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 786 >>> my_dna.reverse_complement() 787 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 788 789 Note in the above example, since R = G or A, its complement 790 is Y (which denotes C or T). 791 792 You can of course used mixed case sequences, 793 794 >>> from Bio.Seq import Seq 795 >>> from Bio.Alphabet import generic_dna 796 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 797 >>> my_dna 798 Seq('CCCCCgatA-G', DNAAlphabet()) 799 >>> my_dna.reverse_complement() 800 Seq('C-TatcGGGGG', DNAAlphabet()) 801 802 Trying to complement a protein sequence raises an exception: 803 804 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 805 >>> my_protein.reverse_complement() 806 Traceback (most recent call last): 807 ... 808 ValueError: Proteins do not have complements! 809 """ 810 # Use -1 stride/step to reverse the complement 811 return self.complement()[::-1]
812
813 - def transcribe(self):
814 """Returns the RNA sequence from a DNA sequence. New Seq object. 815 816 >>> from Bio.Seq import Seq 817 >>> from Bio.Alphabet import IUPAC 818 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 819 ... IUPAC.unambiguous_dna) 820 >>> coding_dna 821 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 822 >>> coding_dna.transcribe() 823 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 824 825 Trying to transcribe a protein or RNA sequence raises an exception: 826 827 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 828 >>> my_protein.transcribe() 829 Traceback (most recent call last): 830 ... 831 ValueError: Proteins cannot be transcribed! 832 """ 833 base = Alphabet._get_base_alphabet(self.alphabet) 834 if isinstance(base, Alphabet.ProteinAlphabet): 835 raise ValueError("Proteins cannot be transcribed!") 836 if isinstance(base, Alphabet.RNAAlphabet): 837 raise ValueError("RNA cannot be transcribed!") 838 839 if self.alphabet==IUPAC.unambiguous_dna: 840 alphabet = IUPAC.unambiguous_rna 841 elif self.alphabet==IUPAC.ambiguous_dna: 842 alphabet = IUPAC.ambiguous_rna 843 else: 844 alphabet = Alphabet.generic_rna 845 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
846
847 - def back_transcribe(self):
848 """Returns the DNA sequence from an RNA sequence. New Seq object. 849 850 >>> from Bio.Seq import Seq 851 >>> from Bio.Alphabet import IUPAC 852 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 853 ... IUPAC.unambiguous_rna) 854 >>> messenger_rna 855 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 856 >>> messenger_rna.back_transcribe() 857 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 858 859 Trying to back-transcribe a protein or DNA sequence raises an 860 exception: 861 862 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 863 >>> my_protein.back_transcribe() 864 Traceback (most recent call last): 865 ... 866 ValueError: Proteins cannot be back transcribed! 867 """ 868 base = Alphabet._get_base_alphabet(self.alphabet) 869 if isinstance(base, Alphabet.ProteinAlphabet): 870 raise ValueError("Proteins cannot be back transcribed!") 871 if isinstance(base, Alphabet.DNAAlphabet): 872 raise ValueError("DNA cannot be back transcribed!") 873 874 if self.alphabet==IUPAC.unambiguous_rna: 875 alphabet = IUPAC.unambiguous_dna 876 elif self.alphabet==IUPAC.ambiguous_rna: 877 alphabet = IUPAC.ambiguous_dna 878 else: 879 alphabet = Alphabet.generic_dna 880 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
881
882 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 883 cds=False):
884 """Turns a nucleotide sequence into a protein sequence. New Seq object. 885 886 This method will translate DNA or RNA sequences, and those with a 887 nucleotide or generic alphabet. Trying to translate a protein 888 sequence raises an exception. 889 890 Arguments: 891 - table - Which codon table to use? This can be either a name 892 (string), an NCBI identifier (integer), or a CodonTable 893 object (useful for non-standard genetic codes). This 894 defaults to the "Standard" table. 895 - stop_symbol - Single character string, what to use for terminators. 896 This defaults to the asterisk, "*". 897 - to_stop - Boolean, defaults to False meaning do a full translation 898 continuing on past any stop codons (translated as the 899 specified stop_symbol). If True, translation is 900 terminated at the first in frame stop codon (and the 901 stop_symbol is not appended to the returned protein 902 sequence). 903 - cds - Boolean, indicates this is a complete CDS. If True, 904 this checks the sequence starts with a valid alternative start 905 codon (which will be translated as methionine, M), that the 906 sequence length is a multiple of three, and that there is a 907 single in frame stop codon at the end (this will be excluded 908 from the protein sequence, regardless of the to_stop option). 909 If these tests fail, an exception is raised. 910 911 e.g. Using the standard table: 912 913 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 914 >>> coding_dna.translate() 915 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 916 >>> coding_dna.translate(stop_symbol="@") 917 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 918 >>> coding_dna.translate(to_stop=True) 919 Seq('VAIVMGR', ExtendedIUPACProtein()) 920 921 Now using NCBI table 2, where TGA is not a stop codon: 922 923 >>> coding_dna.translate(table=2) 924 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 925 >>> coding_dna.translate(table=2, to_stop=True) 926 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 927 928 In fact, GTG is an alternative start codon under NCBI table 2, meaning 929 this sequence could be a complete CDS: 930 931 >>> coding_dna.translate(table=2, cds=True) 932 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 933 934 It isn't a valid CDS under NCBI table 1, due to both the start codon and 935 also the in frame stop codons: 936 937 >>> coding_dna.translate(table=1, cds=True) 938 Traceback (most recent call last): 939 ... 940 TranslationError: First codon 'GTG' is not a start codon 941 942 If the sequence has no in-frame stop codon, then the to_stop argument 943 has no effect: 944 945 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 946 >>> coding_dna2.translate() 947 Seq('LAIVMGR', ExtendedIUPACProtein()) 948 >>> coding_dna2.translate(to_stop=True) 949 Seq('LAIVMGR', ExtendedIUPACProtein()) 950 951 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 952 or a stop codon. These are translated as "X". Any invalid codon 953 (e.g. "TA?" or "T-A") will throw a TranslationError. 954 955 NOTE - Does NOT support gapped sequences. 956 957 NOTE - This does NOT behave like the python string's translate 958 method. For that use str(my_seq).translate(...) instead. 959 """ 960 if isinstance(table, str) and len(table)==256: 961 raise ValueError("The Seq object translate method DOES NOT take " 962 + "a 256 character string mapping table like " 963 + "the python string object's translate method. " 964 + "Use str(my_seq).translate(...) instead.") 965 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 966 Alphabet.ProteinAlphabet): 967 raise ValueError("Proteins cannot be translated!") 968 try: 969 table_id = int(table) 970 except ValueError: 971 # Assume its a table name 972 if self.alphabet==IUPAC.unambiguous_dna: 973 # Will use standard IUPAC protein alphabet, no need for X 974 codon_table = CodonTable.unambiguous_dna_by_name[table] 975 elif self.alphabet==IUPAC.unambiguous_rna: 976 # Will use standard IUPAC protein alphabet, no need for X 977 codon_table = CodonTable.unambiguous_rna_by_name[table] 978 else: 979 # This will use the extended IUPAC protein alphabet with X etc. 980 # The same table can be used for RNA or DNA (we use this for 981 # translating strings). 982 codon_table = CodonTable.ambiguous_generic_by_name[table] 983 except (AttributeError, TypeError): 984 # Assume its a CodonTable object 985 if isinstance(table, CodonTable.CodonTable): 986 codon_table = table 987 else: 988 raise ValueError('Bad table argument') 989 else: 990 # Assume its a table ID 991 if self.alphabet==IUPAC.unambiguous_dna: 992 # Will use standard IUPAC protein alphabet, no need for X 993 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 994 elif self.alphabet==IUPAC.unambiguous_rna: 995 # Will use standard IUPAC protein alphabet, no need for X 996 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 997 else: 998 # This will use the extended IUPAC protein alphabet with X etc. 999 # The same table can be used for RNA or DNA (we use this for 1000 # translating strings). 1001 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 1002 protein = _translate_str(str(self), codon_table, 1003 stop_symbol, to_stop, cds) 1004 if stop_symbol in protein: 1005 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 1006 stop_symbol=stop_symbol) 1007 else: 1008 alphabet = codon_table.protein_alphabet 1009 return Seq(protein, alphabet)
1010
1011 - def ungap(self, gap=None):
1012 """Return a copy of the sequence without the gap character(s). 1013 1014 The gap character can be specified in two ways - either as an explicit 1015 argument, or via the sequence's alphabet. For example: 1016 1017 >>> from Bio.Seq import Seq 1018 >>> from Bio.Alphabet import generic_dna 1019 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1020 >>> my_dna 1021 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1022 >>> my_dna.ungap("-") 1023 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1024 1025 If the gap character is not given as an argument, it will be taken from 1026 the sequence's alphabet (if defined). Notice that the returned sequence's 1027 alphabet is adjusted since it no longer requires a gapped alphabet: 1028 1029 >>> from Bio.Seq import Seq 1030 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1031 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1032 >>> my_pro 1033 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1034 >>> my_pro.ungap() 1035 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1036 1037 Or, with a simpler gapped DNA example: 1038 1039 >>> from Bio.Seq import Seq 1040 >>> from Bio.Alphabet import IUPAC, Gapped 1041 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1042 >>> my_seq 1043 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1044 >>> my_seq.ungap() 1045 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1046 1047 As long as it is consistent with the alphabet, although it is redundant, 1048 you can still supply the gap character as an argument to this method: 1049 1050 >>> my_seq 1051 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1052 >>> my_seq.ungap("=") 1053 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1054 1055 However, if the gap character given as the argument disagrees with that 1056 declared in the alphabet, an exception is raised: 1057 1058 >>> my_seq 1059 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1060 >>> my_seq.ungap("-") 1061 Traceback (most recent call last): 1062 ... 1063 ValueError: Gap '-' does not match '=' from alphabet 1064 1065 Finally, if a gap character is not supplied, and the alphabet does not 1066 define one, an exception is raised: 1067 1068 >>> from Bio.Seq import Seq 1069 >>> from Bio.Alphabet import generic_dna 1070 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1071 >>> my_dna 1072 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1073 >>> my_dna.ungap() 1074 Traceback (most recent call last): 1075 ... 1076 ValueError: Gap character not given and not defined in alphabet 1077 1078 """ 1079 if hasattr(self.alphabet, "gap_char"): 1080 if not gap: 1081 gap = self.alphabet.gap_char 1082 elif gap != self.alphabet.gap_char: 1083 raise ValueError("Gap %s does not match %s from alphabet" 1084 % (repr(gap), repr(self.alphabet.gap_char))) 1085 alpha = Alphabet._ungap(self.alphabet) 1086 elif not gap: 1087 raise ValueError("Gap character not given and not defined in alphabet") 1088 else: 1089 alpha = self.alphabet # modify! 1090 if len(gap)!=1 or not isinstance(gap, str): 1091 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1092 return Seq(str(self).replace(gap, ""), alpha)
1093 1094
1095 -class UnknownSeq(Seq):
1096 """A read-only sequence object of known length but unknown contents. 1097 1098 If you have an unknown sequence, you can represent this with a normal 1099 Seq object, for example: 1100 1101 >>> my_seq = Seq("N"*5) 1102 >>> my_seq 1103 Seq('NNNNN', Alphabet()) 1104 >>> len(my_seq) 1105 5 1106 >>> print(my_seq) 1107 NNNNN 1108 1109 However, this is rather wasteful of memory (especially for large 1110 sequences), which is where this class is most usefull: 1111 1112 >>> unk_five = UnknownSeq(5) 1113 >>> unk_five 1114 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1115 >>> len(unk_five) 1116 5 1117 >>> print(unk_five) 1118 ????? 1119 1120 You can add unknown sequence together, provided their alphabets and 1121 characters are compatible, and get another memory saving UnknownSeq: 1122 1123 >>> unk_four = UnknownSeq(4) 1124 >>> unk_four 1125 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1126 >>> unk_four + unk_five 1127 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1128 1129 If the alphabet or characters don't match up, the addition gives an 1130 ordinary Seq object: 1131 1132 >>> unk_nnnn = UnknownSeq(4, character = "N") 1133 >>> unk_nnnn 1134 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1135 >>> unk_nnnn + unk_four 1136 Seq('NNNN????', Alphabet()) 1137 1138 Combining with a real Seq gives a new Seq object: 1139 1140 >>> known_seq = Seq("ACGT") 1141 >>> unk_four + known_seq 1142 Seq('????ACGT', Alphabet()) 1143 >>> known_seq + unk_four 1144 Seq('ACGT????', Alphabet()) 1145 """
1146 - def __init__(self, length, alphabet=Alphabet.generic_alphabet, character=None):
1147 """Create a new UnknownSeq object. 1148 1149 If character is omitted, it is determined from the alphabet, "N" for 1150 nucleotides, "X" for proteins, and "?" otherwise. 1151 """ 1152 self._length = int(length) 1153 if self._length < 0: 1154 # TODO - Block zero length UnknownSeq? You can just use a Seq! 1155 raise ValueError("Length must not be negative.") 1156 self.alphabet = alphabet 1157 if character: 1158 if len(character) != 1: 1159 raise ValueError("character argument should be a single letter string.") 1160 self._character = character 1161 else: 1162 base = Alphabet._get_base_alphabet(alphabet) 1163 # TODO? Check the case of the letters in the alphabet? 1164 # We may have to use "n" instead of "N" etc. 1165 if isinstance(base, Alphabet.NucleotideAlphabet): 1166 self._character = "N" 1167 elif isinstance(base, Alphabet.ProteinAlphabet): 1168 self._character = "X" 1169 else: 1170 self._character = "?"
1171
1172 - def __len__(self):
1173 """Returns the stated length of the unknown sequence.""" 1174 return self._length
1175
1176 - def __str__(self):
1177 """Returns the unknown sequence as full string of the given length.""" 1178 return self._character * self._length
1179
1180 - def __repr__(self):
1181 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1182 % (self._length, repr(self.alphabet), repr(self._character))
1183
1184 - def __add__(self, other):
1185 """Add another sequence or string to this sequence. 1186 1187 Adding two UnknownSeq objects returns another UnknownSeq object 1188 provided the character is the same and the alphabets are compatible. 1189 1190 >>> from Bio.Seq import UnknownSeq 1191 >>> from Bio.Alphabet import generic_protein 1192 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1193 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1194 1195 If the characters differ, an UnknownSeq object cannot be used, so a 1196 Seq object is returned: 1197 1198 >>> from Bio.Seq import UnknownSeq 1199 >>> from Bio.Alphabet import generic_protein 1200 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1201 ... character="x") 1202 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1203 1204 If adding a string to an UnknownSeq, a new Seq is returned with the 1205 same alphabet: 1206 1207 >>> from Bio.Seq import UnknownSeq 1208 >>> from Bio.Alphabet import generic_protein 1209 >>> UnknownSeq(5, generic_protein) + "LV" 1210 Seq('XXXXXLV', ProteinAlphabet()) 1211 """ 1212 if isinstance(other, UnknownSeq) \ 1213 and other._character == self._character: 1214 # TODO - Check the alphabets match 1215 return UnknownSeq(len(self)+len(other), 1216 self.alphabet, self._character) 1217 # Offload to the base class... 1218 return Seq(str(self), self.alphabet) + other
1219
1220 - def __radd__(self, other):
1221 # If other is an UnknownSeq, then __add__ would be called. 1222 # Offload to the base class... 1223 return other + Seq(str(self), self.alphabet)
1224
1225 - def __getitem__(self, index):
1226 """Get a subsequence from the UnknownSeq object. 1227 1228 >>> unk = UnknownSeq(8, character="N") 1229 >>> print(unk[:]) 1230 NNNNNNNN 1231 >>> print(unk[5:3]) 1232 <BLANKLINE> 1233 >>> print(unk[1:-1]) 1234 NNNNNN 1235 >>> print(unk[1:-1:2]) 1236 NNN 1237 """ 1238 if isinstance(index, int): 1239 # TODO - Check the bounds without wasting memory 1240 return str(self)[index] 1241 old_length = self._length 1242 step = index.step 1243 if step is None or step == 1: 1244 # This calculates the length you'd get from ("N"*old_length)[index] 1245 start = index.start 1246 end = index.stop 1247 if start is None: 1248 start = 0 1249 elif start < 0: 1250 start = max(0, old_length + start) 1251 elif start > old_length: 1252 start = old_length 1253 if end is None: 1254 end = old_length 1255 elif end < 0: 1256 end = max(0, old_length + end) 1257 elif end > old_length: 1258 end = old_length 1259 new_length = max(0, end-start) 1260 elif step == 0: 1261 raise ValueError("slice step cannot be zero") 1262 else: 1263 # TODO - handle step efficiently 1264 new_length = len(("X"*old_length)[index]) 1265 # assert new_length == len(("X"*old_length)[index]), \ 1266 # (index, start, end, step, old_length, 1267 # new_length, len(("X"*old_length)[index])) 1268 return UnknownSeq(new_length, self.alphabet, self._character)
1269
1270 - def count(self, sub, start=0, end=sys.maxsize):
1271 """Non-overlapping count method, like that of a python string. 1272 1273 This behaves like the python string (and Seq object) method of the 1274 same name, which does a non-overlapping count! 1275 1276 Returns an integer, the number of occurrences of substring 1277 argument sub in the (sub)sequence given by [start:end]. 1278 Optional arguments start and end are interpreted as in slice 1279 notation. 1280 1281 Arguments: 1282 - sub - a string or another Seq object to look for 1283 - start - optional integer, slice start 1284 - end - optional integer, slice end 1285 1286 >>> "NNNN".count("N") 1287 4 1288 >>> Seq("NNNN").count("N") 1289 4 1290 >>> UnknownSeq(4, character="N").count("N") 1291 4 1292 >>> UnknownSeq(4, character="N").count("A") 1293 0 1294 >>> UnknownSeq(4, character="N").count("AA") 1295 0 1296 1297 HOWEVER, please note because that python strings and Seq objects (and 1298 MutableSeq objects) do a non-overlapping search, this may not give 1299 the answer you expect: 1300 1301 >>> UnknownSeq(4, character="N").count("NN") 1302 2 1303 >>> UnknownSeq(4, character="N").count("NNN") 1304 1 1305 """ 1306 sub_str = self._get_seq_str_and_check_alphabet(sub) 1307 if len(sub_str) == 1: 1308 if str(sub_str) == self._character: 1309 if start==0 and end >= self._length: 1310 return self._length 1311 else: 1312 # This could be done more cleverly... 1313 return str(self).count(sub_str, start, end) 1314 else: 1315 return 0 1316 else: 1317 if set(sub_str) == set(self._character): 1318 if start==0 and end >= self._length: 1319 return self._length // len(sub_str) 1320 else: 1321 # This could be done more cleverly... 1322 return str(self).count(sub_str, start, end) 1323 else: 1324 return 0
1325
1326 - def complement(self):
1327 """The complement of an unknown nucleotide equals itself. 1328 1329 >>> my_nuc = UnknownSeq(8) 1330 >>> my_nuc 1331 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1332 >>> print(my_nuc) 1333 ???????? 1334 >>> my_nuc.complement() 1335 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1336 >>> print(my_nuc.complement()) 1337 ???????? 1338 """ 1339 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1340 Alphabet.ProteinAlphabet): 1341 raise ValueError("Proteins do not have complements!") 1342 return self
1343
1344 - def reverse_complement(self):
1345 """The reverse complement of an unknown nucleotide equals itself. 1346 1347 >>> my_nuc = UnknownSeq(10) 1348 >>> my_nuc 1349 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1350 >>> print(my_nuc) 1351 ?????????? 1352 >>> my_nuc.reverse_complement() 1353 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1354 >>> print(my_nuc.reverse_complement()) 1355 ?????????? 1356 """ 1357 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1358 Alphabet.ProteinAlphabet): 1359 raise ValueError("Proteins do not have complements!") 1360 return self
1361
1362 - def transcribe(self):
1363 """Returns unknown RNA sequence from an unknown DNA sequence. 1364 1365 >>> my_dna = UnknownSeq(10, character="N") 1366 >>> my_dna 1367 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1368 >>> print(my_dna) 1369 NNNNNNNNNN 1370 >>> my_rna = my_dna.transcribe() 1371 >>> my_rna 1372 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1373 >>> print(my_rna) 1374 NNNNNNNNNN 1375 """ 1376 # Offload the alphabet stuff 1377 s = Seq(self._character, self.alphabet).transcribe() 1378 return UnknownSeq(self._length, s.alphabet, self._character)
1379
1380 - def back_transcribe(self):
1381 """Returns unknown DNA sequence from an unknown RNA sequence. 1382 1383 >>> my_rna = UnknownSeq(20, character="N") 1384 >>> my_rna 1385 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1386 >>> print(my_rna) 1387 NNNNNNNNNNNNNNNNNNNN 1388 >>> my_dna = my_rna.back_transcribe() 1389 >>> my_dna 1390 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1391 >>> print(my_dna) 1392 NNNNNNNNNNNNNNNNNNNN 1393 """ 1394 # Offload the alphabet stuff 1395 s = Seq(self._character, self.alphabet).back_transcribe() 1396 return UnknownSeq(self._length, s.alphabet, self._character)
1397
1398 - def upper(self):
1399 """Returns an upper case copy of the sequence. 1400 1401 >>> from Bio.Alphabet import generic_dna 1402 >>> from Bio.Seq import UnknownSeq 1403 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1404 >>> my_seq 1405 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1406 >>> print(my_seq) 1407 nnnnnnnnnnnnnnnnnnnn 1408 >>> my_seq.upper() 1409 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1410 >>> print(my_seq.upper()) 1411 NNNNNNNNNNNNNNNNNNNN 1412 1413 This will adjust the alphabet if required. See also the lower method. 1414 """ 1415 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1416
1417 - def lower(self):
1418 """Returns a lower case copy of the sequence. 1419 1420 This will adjust the alphabet if required: 1421 1422 >>> from Bio.Alphabet import IUPAC 1423 >>> from Bio.Seq import UnknownSeq 1424 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1425 >>> my_seq 1426 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1427 >>> print(my_seq) 1428 XXXXXXXXXXXXXXXXXXXX 1429 >>> my_seq.lower() 1430 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1431 >>> print(my_seq.lower()) 1432 xxxxxxxxxxxxxxxxxxxx 1433 1434 See also the upper method. 1435 """ 1436 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1437
1438 - def translate(self, **kwargs):
1439 """Translate an unknown nucleotide sequence into an unknown protein. 1440 1441 e.g. 1442 1443 >>> my_seq = UnknownSeq(9, character="N") 1444 >>> print(my_seq) 1445 NNNNNNNNN 1446 >>> my_protein = my_seq.translate() 1447 >>> my_protein 1448 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1449 >>> print(my_protein) 1450 XXX 1451 1452 In comparison, using a normal Seq object: 1453 1454 >>> my_seq = Seq("NNNNNNNNN") 1455 >>> print(my_seq) 1456 NNNNNNNNN 1457 >>> my_protein = my_seq.translate() 1458 >>> my_protein 1459 Seq('XXX', ExtendedIUPACProtein()) 1460 >>> print(my_protein) 1461 XXX 1462 1463 """ 1464 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1465 Alphabet.ProteinAlphabet): 1466 raise ValueError("Proteins cannot be translated!") 1467 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1468
1469 - def ungap(self, gap=None):
1470 """Return a copy of the sequence without the gap character(s). 1471 1472 The gap character can be specified in two ways - either as an explicit 1473 argument, or via the sequence's alphabet. For example: 1474 1475 >>> from Bio.Seq import UnknownSeq 1476 >>> from Bio.Alphabet import Gapped, generic_dna 1477 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1478 >>> my_dna 1479 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1480 >>> my_dna.ungap() 1481 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1482 >>> my_dna.ungap("-") 1483 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1484 1485 If the UnknownSeq is using the gap character, then an empty Seq is 1486 returned: 1487 1488 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1489 >>> my_gap 1490 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1491 >>> my_gap.ungap() 1492 Seq('', DNAAlphabet()) 1493 >>> my_gap.ungap("-") 1494 Seq('', DNAAlphabet()) 1495 1496 Notice that the returned sequence's alphabet is adjusted to remove any 1497 explicit gap character declaration. 1498 """ 1499 # Offload the alphabet stuff 1500 s = Seq(self._character, self.alphabet).ungap() 1501 if s: 1502 return UnknownSeq(self._length, s.alphabet, self._character) 1503 else: 1504 return Seq("", s.alphabet)
1505 1506
1507 -class MutableSeq(object):
1508 """An editable sequence object (with an alphabet). 1509 1510 Unlike normal python strings and our basic sequence object (the Seq class) 1511 which are immuatable, the MutableSeq lets you edit the sequence in place. 1512 However, this means you cannot use a MutableSeq object as a dictionary key. 1513 1514 >>> from Bio.Seq import MutableSeq 1515 >>> from Bio.Alphabet import generic_dna 1516 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1517 >>> my_seq 1518 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1519 >>> my_seq[5] 1520 'T' 1521 >>> my_seq[5] = "A" 1522 >>> my_seq 1523 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1524 >>> my_seq[5] 1525 'A' 1526 >>> my_seq[5:8] = "NNN" 1527 >>> my_seq 1528 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1529 >>> len(my_seq) 1530 11 1531 1532 Note that the MutableSeq object does not support as many string-like 1533 or biological methods as the Seq object. 1534 """
1535 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
1536 if sys.version_info[0] == 3: 1537 self.array_indicator = "u" 1538 else: 1539 self.array_indicator = "c" 1540 if isinstance(data, str): # TODO - What about unicode? 1541 self.data = array.array(self.array_indicator, data) 1542 else: 1543 self.data = data # assumes the input is an array 1544 self.alphabet = alphabet
1545
1546 - def __repr__(self):
1547 """Returns a (truncated) representation of the sequence for debugging.""" 1548 if len(self) > 60: 1549 # Shows the last three letters as it is often useful to see if there 1550 # is a stop codon at the end of a sequence. 1551 # Note total length is 54+3+3=60 1552 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1553 str(self[:54]), str(self[-3:]), 1554 repr(self.alphabet)) 1555 else: 1556 return "%s('%s', %s)" % (self.__class__.__name__, 1557 str(self), 1558 repr(self.alphabet))
1559
1560 - def __str__(self):
1561 """Returns the full sequence as a python string. 1562 1563 Note that Biopython 1.44 and earlier would give a truncated 1564 version of repr(my_seq) for str(my_seq). If you are writing code 1565 which needs to be backwards compatible with old Biopython, you 1566 should continue to use my_seq.tostring() rather than str(my_seq). 1567 """ 1568 # See test_GAQueens.py for an historic usage of a non-string alphabet! 1569 return "".join(self.data)
1570
1571 - def __eq__(self, other):
1572 """Compare the sequence to another sequence or a string (README). 1573 1574 Currently if compared to another sequence the alphabets must be 1575 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1576 an exception. Otherwise only the sequence itself is compared, not the 1577 precise alphabet. 1578 1579 A future release of Biopython will change this (and the Seq object etc) 1580 to use simple string comparison. The plan is that comparing sequences 1581 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1582 but not an exception. 1583 1584 During this transition period, please just do explicit comparisons: 1585 1586 >>> seq1 = MutableSeq("ACGT") 1587 >>> seq2 = MutableSeq("ACGT") 1588 >>> id(seq1) == id(seq2) 1589 False 1590 >>> str(seq1) == str(seq2) 1591 True 1592 1593 Biopython now does: 1594 1595 >>> seq1 == seq2 1596 True 1597 >>> seq1 == Seq("ACGT") 1598 True 1599 >>> seq1 == "ACGT" 1600 True 1601 1602 """ 1603 if hasattr(other, "alphabet"): 1604 if not Alphabet._check_type_compatible([self.alphabet, 1605 other.alphabet]): 1606 warnings.warn("Incompatible alphabets %s and %s" 1607 % (repr(self.alphabet), repr(other.alphabet)), 1608 BiopythonWarning) 1609 if isinstance(other, MutableSeq): 1610 return self.data == other.data 1611 return str(self) == str(other)
1612
1613 - def __ne__(self, other):
1614 """Not equal, see __eq__ documentation.""" 1615 # Seem to require this method under Python 2 but not needed on Python 3? 1616 return not (self == other)
1617
1618 - def __lt__(self, other):
1619 """Less than, see __eq__ documentation.""" 1620 if hasattr(other, "alphabet"): 1621 if not Alphabet._check_type_compatible([self.alphabet, 1622 other.alphabet]): 1623 warnings.warn("Incompatible alphabets %s and %s" 1624 % (repr(self.alphabet), repr(other.alphabet)), 1625 BiopythonWarning) 1626 if isinstance(other, MutableSeq): 1627 return self.data < other.data 1628 return str(self) < str(other)
1629
1630 - def __le__(self, other):
1631 """Less than or equal, see __eq__ documentation.""" 1632 if hasattr(other, "alphabet"): 1633 if not Alphabet._check_type_compatible([self.alphabet, 1634 other.alphabet]): 1635 warnings.warn("Incompatible alphabets %s and %s" 1636 % (repr(self.alphabet), repr(other.alphabet)), 1637 BiopythonWarning) 1638 if isinstance(other, MutableSeq): 1639 return self.data <= other.data 1640 return str(self) <= str(other)
1641
1642 - def __len__(self):
1643 return len(self.data)
1644
1645 - def __getitem__(self, index):
1646 # Note since Python 2.0, __getslice__ is deprecated 1647 # and __getitem__ is used instead. 1648 # See http://docs.python.org/ref/sequence-methods.html 1649 if isinstance(index, int): 1650 # Return a single letter as a string 1651 return self.data[index] 1652 else: 1653 # Return the (sub)sequence as another Seq object 1654 return MutableSeq(self.data[index], self.alphabet)
1655
1656 - def __setitem__(self, index, value):
1657 # Note since Python 2.0, __setslice__ is deprecated 1658 # and __setitem__ is used instead. 1659 # See http://docs.python.org/ref/sequence-methods.html 1660 if isinstance(index, int): 1661 # Replacing a single letter with a new string 1662 self.data[index] = value 1663 else: 1664 # Replacing a sub-sequence 1665 if isinstance(value, MutableSeq): 1666 self.data[index] = value.data 1667 elif isinstance(value, type(self.data)): 1668 self.data[index] = value 1669 else: 1670 self.data[index] = array.array(self.array_indicator, 1671 str(value))
1672
1673 - def __delitem__(self, index):
1674 # Note since Python 2.0, __delslice__ is deprecated 1675 # and __delitem__ is used instead. 1676 # See http://docs.python.org/ref/sequence-methods.html 1677 1678 # Could be deleting a single letter, or a slice 1679 del self.data[index]
1680
1681 - def __add__(self, other):
1682 """Add another sequence or string to this sequence. 1683 1684 Returns a new MutableSeq object.""" 1685 if hasattr(other, "alphabet"): 1686 # other should be a Seq or a MutableSeq 1687 if not Alphabet._check_type_compatible([self.alphabet, 1688 other.alphabet]): 1689 raise TypeError("Incompatible alphabets %s and %s" 1690 % (repr(self.alphabet), repr(other.alphabet))) 1691 # They should be the same sequence type (or one of them is generic) 1692 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1693 if isinstance(other, MutableSeq): 1694 # See test_GAQueens.py for an historic usage of a non-string 1695 # alphabet! Adding the arrays should support this. 1696 return self.__class__(self.data + other.data, a) 1697 else: 1698 return self.__class__(str(self) + str(other), a) 1699 elif isinstance(other, basestring): 1700 # other is a plain string - use the current alphabet 1701 return self.__class__(str(self) + str(other), self.alphabet) 1702 else: 1703 raise TypeError
1704
1705 - def __radd__(self, other):
1706 if hasattr(other, "alphabet"): 1707 # other should be a Seq or a MutableSeq 1708 if not Alphabet._check_type_compatible([self.alphabet, 1709 other.alphabet]): 1710 raise TypeError("Incompatible alphabets %s and %s" 1711 % (repr(self.alphabet), repr(other.alphabet))) 1712 # They should be the same sequence type (or one of them is generic) 1713 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1714 if isinstance(other, MutableSeq): 1715 # See test_GAQueens.py for an historic usage of a non-string 1716 # alphabet! Adding the arrays should support this. 1717 return self.__class__(other.data + self.data, a) 1718 else: 1719 return self.__class__(str(other) + str(self), a) 1720 elif isinstance(other, basestring): 1721 # other is a plain string - use the current alphabet 1722 return self.__class__(str(other) + str(self), self.alphabet) 1723 else: 1724 raise TypeError
1725
1726 - def append(self, c):
1727 self.data.append(c)
1728
1729 - def insert(self, i, c):
1730 self.data.insert(i, c)
1731
1732 - def pop(self, i=(-1)):
1733 c = self.data[i] 1734 del self.data[i] 1735 return c
1736
1737 - def remove(self, item):
1738 for i in range(len(self.data)): 1739 if self.data[i] == item: 1740 del self.data[i] 1741 return 1742 raise ValueError("MutableSeq.remove(x): x not in list")
1743
1744 - def count(self, sub, start=0, end=sys.maxsize):
1745 """Non-overlapping count method, like that of a python string. 1746 1747 This behaves like the python string method of the same name, 1748 which does a non-overlapping count! 1749 1750 Returns an integer, the number of occurrences of substring 1751 argument sub in the (sub)sequence given by [start:end]. 1752 Optional arguments start and end are interpreted as in slice 1753 notation. 1754 1755 Arguments: 1756 - sub - a string or another Seq object to look for 1757 - start - optional integer, slice start 1758 - end - optional integer, slice end 1759 1760 e.g. 1761 1762 >>> from Bio.Seq import MutableSeq 1763 >>> my_mseq = MutableSeq("AAAATGA") 1764 >>> print(my_mseq.count("A")) 1765 5 1766 >>> print(my_mseq.count("ATG")) 1767 1 1768 >>> print(my_mseq.count(Seq("AT"))) 1769 1 1770 >>> print(my_mseq.count("AT", 2, -1)) 1771 1 1772 1773 HOWEVER, please note because that python strings, Seq objects and 1774 MutableSeq objects do a non-overlapping search, this may not give 1775 the answer you expect: 1776 1777 >>> "AAAA".count("AA") 1778 2 1779 >>> print(MutableSeq("AAAA").count("AA")) 1780 2 1781 1782 An overlapping search would give the answer as three! 1783 """ 1784 try: 1785 # TODO - Should we check the alphabet? 1786 search = str(sub) 1787 except AttributeError: 1788 search = sub 1789 1790 if not isinstance(search, basestring): 1791 raise TypeError("expected a string, Seq or MutableSeq") 1792 1793 if len(search) == 1: 1794 # Try and be efficient and work directly from the array. 1795 count = 0 1796 for c in self.data[start:end]: 1797 if c == search: 1798 count += 1 1799 return count 1800 else: 1801 # TODO - Can we do this more efficiently? 1802 return str(self).count(search, start, end)
1803
1804 - def index(self, item):
1805 for i in range(len(self.data)): 1806 if self.data[i] == item: 1807 return i 1808 raise ValueError("MutableSeq.index(x): x not in list")
1809
1810 - def reverse(self):
1811 """Modify the mutable sequence to reverse itself. 1812 1813 No return value. 1814 """ 1815 self.data.reverse()
1816
1817 - def complement(self):
1818 """Modify the mutable sequence to take on its complement. 1819 1820 Trying to complement a protein sequence raises an exception. 1821 1822 No return value. 1823 """ 1824 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1825 Alphabet.ProteinAlphabet): 1826 raise ValueError("Proteins do not have complements!") 1827 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1828 d = ambiguous_dna_complement 1829 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1830 d = ambiguous_rna_complement 1831 elif 'U' in self.data and 'T' in self.data: 1832 # TODO - Handle this cleanly? 1833 raise ValueError("Mixed RNA/DNA found") 1834 elif 'U' in self.data: 1835 d = ambiguous_rna_complement 1836 else: 1837 d = ambiguous_dna_complement 1838 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 1839 d.update(c) 1840 self.data = [d[c] for c in self.data] 1841 self.data = array.array(self.array_indicator, self.data)
1842
1843 - def reverse_complement(self):
1844 """Modify the mutable sequence to take on its reverse complement. 1845 1846 Trying to reverse complement a protein sequence raises an exception. 1847 1848 No return value. 1849 """ 1850 self.complement() 1851 self.data.reverse()
1852 1853 # Sorting a sequence makes no sense. 1854 # def sort(self, *args): self.data.sort(*args) 1855
1856 - def extend(self, other):
1857 if isinstance(other, MutableSeq): 1858 for c in other.data: 1859 self.data.append(c) 1860 else: 1861 for c in other: 1862 self.data.append(c)
1863
1864 - def tostring(self):
1865 """Returns the full sequence as a python string (DEPRECATED). 1866 1867 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 1868 as this method is officially deprecated. 1869 1870 Because str(my_seq) will give you the full sequence as a python string, 1871 there is often no need to make an explicit conversion. For example, 1872 1873 print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 1874 1875 On Biopython 1.44 or older you would have to have done this: 1876 1877 print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 1878 """ 1879 from Bio import BiopythonDeprecationWarning 1880 warnings.warn("This method is obsolete; please use str(my_seq) " 1881 "instead of my_seq.tostring().", 1882 BiopythonDeprecationWarning) 1883 return "".join(self.data)
1884
1885 - def toseq(self):
1886 """Returns the full sequence as a new immutable Seq object. 1887 1888 >>> from Bio.Seq import Seq 1889 >>> from Bio.Alphabet import IUPAC 1890 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1891 ... IUPAC.protein) 1892 >>> my_mseq 1893 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1894 >>> my_mseq.toseq() 1895 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1896 1897 Note that the alphabet is preserved. 1898 """ 1899 return Seq("".join(self.data), self.alphabet)
1900 1901 1902 # The transcribe, backward_transcribe, and translate functions are 1903 # user-friendly versions of the corresponding functions in Bio.Transcribe 1904 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1905
1906 -def transcribe(dna):
1907 """Transcribes a DNA sequence into RNA. 1908 1909 If given a string, returns a new string object. 1910 1911 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1912 1913 Trying to transcribe a protein or RNA sequence raises an exception. 1914 1915 e.g. 1916 1917 >>> transcribe("ACTGN") 1918 'ACUGN' 1919 """ 1920 if isinstance(dna, Seq): 1921 return dna.transcribe() 1922 elif isinstance(dna, MutableSeq): 1923 return dna.toseq().transcribe() 1924 else: 1925 return dna.replace('T', 'U').replace('t', 'u')
1926 1927
1928 -def back_transcribe(rna):
1929 """Back-transcribes an RNA sequence into DNA. 1930 1931 If given a string, returns a new string object. 1932 1933 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1934 1935 Trying to transcribe a protein or DNA sequence raises an exception. 1936 1937 e.g. 1938 1939 >>> back_transcribe("ACUGN") 1940 'ACTGN' 1941 """ 1942 if isinstance(rna, Seq): 1943 return rna.back_transcribe() 1944 elif isinstance(rna, MutableSeq): 1945 return rna.toseq().back_transcribe() 1946 else: 1947 return rna.replace('U', 'T').replace('u', 't')
1948 1949
1950 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1951 cds=False, pos_stop="X"):
1952 """Helper function to translate a nucleotide string (PRIVATE). 1953 1954 Arguments: 1955 - sequence - a string 1956 - table - a CodonTable object (NOT a table name or id number) 1957 - stop_symbol - a single character string, what to use for terminators. 1958 - to_stop - boolean, should translation terminate at the first 1959 in frame stop codon? If there is no in-frame stop codon 1960 then translation continues to the end. 1961 - pos_stop - a single character string for a possible stop codon 1962 (e.g. TAN or NNN) 1963 - cds - Boolean, indicates this is a complete CDS. If True, this 1964 checks the sequence starts with a valid alternative start 1965 codon (which will be translated as methionine, M), that the 1966 sequence length is a multiple of three, and that there is a 1967 single in frame stop codon at the end (this will be excluded 1968 from the protein sequence, regardless of the to_stop option). 1969 If these tests fail, an exception is raised. 1970 1971 Returns a string. 1972 1973 e.g. 1974 1975 >>> from Bio.Data import CodonTable 1976 >>> table = CodonTable.ambiguous_dna_by_id[1] 1977 >>> _translate_str("AAA", table) 1978 'K' 1979 >>> _translate_str("TAR", table) 1980 '*' 1981 >>> _translate_str("TAN", table) 1982 'X' 1983 >>> _translate_str("TAN", table, pos_stop="@") 1984 '@' 1985 >>> _translate_str("TA?", table) 1986 Traceback (most recent call last): 1987 ... 1988 TranslationError: Codon 'TA?' is invalid 1989 1990 In a change to older verions of Biopython, partial codons are now 1991 always regarded as an error (previously only checked if cds=True) 1992 and will trigger a warning (likely to become an exception in a 1993 future release). 1994 1995 If **cds=True**, the start and stop codons are checked, and the start 1996 codon will be translated at methionine. The sequence must be an 1997 while number of codons. 1998 1999 >>> _translate_str("ATGCCCTAG", table, cds=True) 2000 'MP' 2001 >>> _translate_str("AAACCCTAG", table, cds=True) 2002 Traceback (most recent call last): 2003 ... 2004 TranslationError: First codon 'AAA' is not a start codon 2005 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 2006 Traceback (most recent call last): 2007 ... 2008 TranslationError: Extra in frame stop codon found. 2009 """ 2010 sequence = sequence.upper() 2011 amino_acids = [] 2012 forward_table = table.forward_table 2013 stop_codons = table.stop_codons 2014 if table.nucleotide_alphabet.letters is not None: 2015 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 2016 else: 2017 # Assume the worst case, ambiguous DNA or RNA: 2018 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 2019 IUPAC.ambiguous_rna.letters.upper()) 2020 n = len(sequence) 2021 if cds: 2022 if str(sequence[:3]).upper() not in table.start_codons: 2023 raise CodonTable.TranslationError( 2024 "First codon '%s' is not a start codon" % sequence[:3]) 2025 if n % 3 != 0: 2026 raise CodonTable.TranslationError( 2027 "Sequence length %i is not a multiple of three" % n) 2028 if str(sequence[-3:]).upper() not in stop_codons: 2029 raise CodonTable.TranslationError( 2030 "Final codon '%s' is not a stop codon" % sequence[-3:]) 2031 # Don't translate the stop symbol, and manually translate the M 2032 sequence = sequence[3:-3] 2033 n -= 6 2034 amino_acids = ["M"] 2035 elif n % 3 != 0: 2036 from Bio import BiopythonWarning 2037 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 2038 "Explicitly trim the sequence or add trailing N before " 2039 "translation. This may become an error in future.", 2040 BiopythonWarning) 2041 for i in range(0, n - n%3, 3): 2042 codon = sequence[i:i+3] 2043 try: 2044 amino_acids.append(forward_table[codon]) 2045 except (KeyError, CodonTable.TranslationError): 2046 # Todo? Treat "---" as a special case (gapped translation) 2047 if codon in table.stop_codons: 2048 if cds: 2049 raise CodonTable.TranslationError( 2050 "Extra in frame stop codon found.") 2051 if to_stop: 2052 break 2053 amino_acids.append(stop_symbol) 2054 elif valid_letters.issuperset(set(codon)): 2055 # Possible stop codon (e.g. NNN or TAN) 2056 amino_acids.append(pos_stop) 2057 else: 2058 raise CodonTable.TranslationError( 2059 "Codon '%s' is invalid" % codon) 2060 return "".join(amino_acids)
2061 2062
2063 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2064 cds=False):
2065 """Translate a nucleotide sequence into amino acids. 2066 2067 If given a string, returns a new string object. Given a Seq or 2068 MutableSeq, returns a Seq object with a protein alphabet. 2069 2070 Arguments: 2071 - table - Which codon table to use? This can be either a name (string), 2072 an NCBI identifier (integer), or a CodonTable object (useful 2073 for non-standard genetic codes). Defaults to the "Standard" 2074 table. 2075 - stop_symbol - Single character string, what to use for any 2076 terminators, defaults to the asterisk, "*". 2077 - to_stop - Boolean, defaults to False meaning do a full 2078 translation continuing on past any stop codons 2079 (translated as the specified stop_symbol). If 2080 True, translation is terminated at the first in 2081 frame stop codon (and the stop_symbol is not 2082 appended to the returned protein sequence). 2083 - cds - Boolean, indicates this is a complete CDS. If True, this 2084 checks the sequence starts with a valid alternative start 2085 codon (which will be translated as methionine, M), that the 2086 sequence length is a multiple of three, and that there is a 2087 single in frame stop codon at the end (this will be excluded 2088 from the protein sequence, regardless of the to_stop option). 2089 If these tests fail, an exception is raised. 2090 2091 A simple string example using the default (standard) genetic code: 2092 2093 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2094 >>> translate(coding_dna) 2095 'VAIVMGR*KGAR*' 2096 >>> translate(coding_dna, stop_symbol="@") 2097 'VAIVMGR@KGAR@' 2098 >>> translate(coding_dna, to_stop=True) 2099 'VAIVMGR' 2100 2101 Now using NCBI table 2, where TGA is not a stop codon: 2102 2103 >>> translate(coding_dna, table=2) 2104 'VAIVMGRWKGAR*' 2105 >>> translate(coding_dna, table=2, to_stop=True) 2106 'VAIVMGRWKGAR' 2107 2108 In fact this example uses an alternative start codon valid under NCBI table 2, 2109 GTG, which means this example is a complete valid CDS which when translated 2110 should really start with methionine (not valine): 2111 2112 >>> translate(coding_dna, table=2, cds=True) 2113 'MAIVMGRWKGAR' 2114 2115 Note that if the sequence has no in-frame stop codon, then the to_stop 2116 argument has no effect: 2117 2118 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2119 >>> translate(coding_dna2) 2120 'VAIVMGR' 2121 >>> translate(coding_dna2, to_stop=True) 2122 'VAIVMGR' 2123 2124 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2125 or a stop codon. These are translated as "X". Any invalid codon 2126 (e.g. "TA?" or "T-A") will throw a TranslationError. 2127 2128 NOTE - Does NOT support gapped sequences. 2129 2130 It will however translate either DNA or RNA. 2131 """ 2132 if isinstance(sequence, Seq): 2133 return sequence.translate(table, stop_symbol, to_stop, cds) 2134 elif isinstance(sequence, MutableSeq): 2135 # Return a Seq object 2136 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2137 else: 2138 # Assume its a string, return a string 2139 try: 2140 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2141 except ValueError: 2142 codon_table = CodonTable.ambiguous_generic_by_name[table] 2143 except (AttributeError, TypeError): 2144 if isinstance(table, CodonTable.CodonTable): 2145 codon_table = table 2146 else: 2147 raise ValueError('Bad table argument') 2148 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2149 2150
2151 -def reverse_complement(sequence):
2152 """Returns the reverse complement sequence of a nucleotide string. 2153 2154 If given a string, returns a new string object. 2155 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2156 2157 Supports unambiguous and ambiguous nucleotide sequences. 2158 2159 e.g. 2160 2161 >>> reverse_complement("ACTG-NH") 2162 'DN-CAGT' 2163 """ 2164 if isinstance(sequence, Seq): 2165 # Return a Seq 2166 return sequence.reverse_complement() 2167 elif isinstance(sequence, MutableSeq): 2168 # Return a Seq 2169 # Don't use the MutableSeq reverse_complement method as it is 'in place'. 2170 return sequence.toseq().reverse_complement() 2171 2172 # Assume its a string. 2173 # In order to avoid some code duplication, the old code would turn the string 2174 # into a Seq, use the reverse_complement method, and convert back to a string. 2175 # This worked, but is over five times slower on short sequences! 2176 if ('U' in sequence or 'u' in sequence) \ 2177 and ('T' in sequence or 't' in sequence): 2178 raise ValueError("Mixed RNA/DNA found") 2179 elif 'U' in sequence or 'u' in sequence: 2180 ttable = _rna_complement_table 2181 else: 2182 ttable = _dna_complement_table 2183 return sequence.translate(ttable)[::-1]
2184 2185
2186 -def _test():
2187 """Run the Bio.Seq module's doctests (PRIVATE).""" 2188 if sys.version_info[0:2] == (3, 1): 2189 print("Not running Bio.Seq doctest on Python 3.1") 2190 print("See http://bugs.python.org/issue7490") 2191 else: 2192 print("Running doctests...") 2193 import doctest 2194 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2195 print("Done")
2196 2197 if __name__ == "__main__": 2198 _test() 2199