Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9   
  10   
  11  from Bio._py3k import basestring 
  12   
  13  __docformat__ = "restructuredtext en"  # Simple markup to show doctests nicely 
  14   
  15  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  16  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  17  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  18  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  19   
  20   
21 -class _RestrictedDict(dict):
22 """Dict which only allows sequences of given length as values (PRIVATE). 23 24 This simple subclass of the Python dictionary is used in the SeqRecord 25 object for holding per-letter-annotations. This class is intended to 26 prevent simple errors by only allowing python sequences (e.g. lists, 27 strings and tuples) to be stored, and only if their length matches that 28 expected (the length of the SeqRecord's seq object). It cannot however 29 prevent the entries being edited in situ (for example appending entries 30 to a list). 31 32 >>> x = _RestrictedDict(5) 33 >>> x["test"] = "hello" 34 >>> x 35 {'test': 'hello'} 36 37 Adding entries which don't have the expected length are blocked: 38 39 >>> x["test"] = "hello world" 40 Traceback (most recent call last): 41 ... 42 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 43 44 The expected length is stored as a private attribute, 45 46 >>> x._length 47 5 48 49 In order that the SeqRecord (and other objects using this class) can be 50 pickled, for example for use in the multiprocessing library, we need to 51 be able to pickle the restricted dictionary objects. 52 53 Using the default protocol, which is 0 on Python 2.x, 54 55 >>> import pickle 56 >>> y = pickle.loads(pickle.dumps(x)) 57 >>> y 58 {'test': 'hello'} 59 >>> y._length 60 5 61 62 Using the highest protocol, which is 2 on Python 2.x, 63 64 >>> import pickle 65 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 66 >>> z 67 {'test': 'hello'} 68 >>> z._length 69 5 70 """ 71
72 - def __init__(self, length):
73 """Create an EMPTY restricted dictionary.""" 74 dict.__init__(self) 75 self._length = int(length)
76
77 - def __setitem__(self, key, value):
78 # The check hasattr(self, "_length") is to cope with pickle protocol 2 79 # I couldn't seem to avoid this with __getstate__ and __setstate__ 80 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 81 or (hasattr(self, "_length") and len(value) != self._length): 82 raise TypeError("We only allow python sequences (lists, tuples or " 83 "strings) of length {0}.".format(self._length)) 84 dict.__setitem__(self, key, value)
85
86 - def update(self, new_dict):
87 # Force this to go via our strict __setitem__ method 88 for (key, value) in new_dict.items(): 89 self[key] = value
90 91
92 -class SeqRecord(object):
93 """A SeqRecord object holds a sequence and information about it. 94 95 Main attributes: 96 - id - Identifier such as a locus tag (string) 97 - seq - The sequence itself (Seq object or similar) 98 99 Additional attributes: 100 - name - Sequence name, e.g. gene name (string) 101 - description - Additional text (string) 102 - dbxrefs - List of database cross references (list of strings) 103 - features - Any (sub)features defined (list of SeqFeature objects) 104 - annotations - Further information about the whole sequence (dictionary). 105 Most entries are strings, or lists of strings. 106 - letter_annotations - Per letter/symbol annotation (restricted 107 dictionary). This holds Python sequences (lists, strings 108 or tuples) whose length matches that of the sequence. 109 A typical use would be to hold a list of integers 110 representing sequencing quality scores, or a string 111 representing the secondary structure. 112 113 You will typically use Bio.SeqIO to read in sequences from files as 114 SeqRecord objects. However, you may want to create your own SeqRecord 115 objects directly (see the __init__ method for further details): 116 117 >>> from Bio.Seq import Seq 118 >>> from Bio.SeqRecord import SeqRecord 119 >>> from Bio.Alphabet import IUPAC 120 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 121 ... IUPAC.protein), 122 ... id="YP_025292.1", name="HokC", 123 ... description="toxic membrane protein") 124 >>> print(record) 125 ID: YP_025292.1 126 Name: HokC 127 Description: toxic membrane protein 128 Number of features: 0 129 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 130 131 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 132 for this. For the special case where you want the SeqRecord turned into 133 a string in a particular file format there is a format method which uses 134 Bio.SeqIO internally: 135 136 >>> print(record.format("fasta")) 137 >YP_025292.1 toxic membrane protein 138 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 139 <BLANKLINE> 140 141 You can also do things like slicing a SeqRecord, checking its length, etc 142 143 >>> len(record) 144 44 145 >>> edited = record[:10] + record[11:] 146 >>> print(edited.seq) 147 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 148 >>> print(record.seq) 149 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 150 151 """ 152
153 - def __init__(self, seq, id="<unknown id>", name="<unknown name>", 154 description="<unknown description>", dbxrefs=None, 155 features=None, annotations=None, 156 letter_annotations=None):
157 """Create a SeqRecord. 158 159 Arguments: 160 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 161 - id - Sequence identifier, recommended (string) 162 - name - Sequence name, optional (string) 163 - description - Sequence description, optional (string) 164 - dbxrefs - Database cross references, optional (list of strings) 165 - features - Any (sub)features, optional (list of SeqFeature objects) 166 - annotations - Dictionary of annotations for the whole sequence 167 - letter_annotations - Dictionary of per-letter-annotations, values 168 should be strings, list or tuples of the same 169 length as the full sequence. 170 171 You will typically use Bio.SeqIO to read in sequences from files as 172 SeqRecord objects. However, you may want to create your own SeqRecord 173 objects directly. 174 175 Note that while an id is optional, we strongly recommend you supply a 176 unique id string for each record. This is especially important 177 if you wish to write your sequences to a file. 178 179 If you don't have the actual sequence, but you do know its length, 180 then using the UnknownSeq object from Bio.Seq is appropriate. 181 182 You can create a 'blank' SeqRecord object, and then populate the 183 attributes later. 184 """ 185 if id is not None and not isinstance(id, basestring): 186 # Lots of existing code uses id=None... this may be a bad idea. 187 raise TypeError("id argument should be a string") 188 if not isinstance(name, basestring): 189 raise TypeError("name argument should be a string") 190 if not isinstance(description, basestring): 191 raise TypeError("description argument should be a string") 192 self._seq = seq 193 self.id = id 194 self.name = name 195 self.description = description 196 197 # database cross references (for the whole sequence) 198 if dbxrefs is None: 199 dbxrefs = [] 200 elif not isinstance(dbxrefs, list): 201 raise TypeError("dbxrefs argument should be a list (of strings)") 202 self.dbxrefs = dbxrefs 203 204 # annotations about the whole sequence 205 if annotations is None: 206 annotations = {} 207 elif not isinstance(annotations, dict): 208 raise TypeError("annotations argument should be a dict") 209 self.annotations = annotations 210 211 if letter_annotations is None: 212 # annotations about each letter in the sequence 213 if seq is None: 214 # Should we allow this and use a normal unrestricted dict? 215 self._per_letter_annotations = _RestrictedDict(length=0) 216 else: 217 try: 218 self._per_letter_annotations = \ 219 _RestrictedDict(length=len(seq)) 220 except: 221 raise TypeError("seq argument should be a Seq object or similar") 222 else: 223 # This will be handled via the property set function, which will 224 # turn this into a _RestrictedDict and thus ensure all the values 225 # in the dict are the right length 226 self.letter_annotations = letter_annotations 227 228 # annotations about parts of the sequence 229 if features is None: 230 features = [] 231 elif not isinstance(features, list): 232 raise TypeError("features argument should be a list (of SeqFeature objects)") 233 self.features = features
234 235 # TODO - Just make this a read only property?
236 - def _set_per_letter_annotations(self, value):
237 if not isinstance(value, dict): 238 raise TypeError("The per-letter-annotations should be a " 239 "(restricted) dictionary.") 240 # Turn this into a restricted-dictionary (and check the entries) 241 try: 242 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 243 except AttributeError: 244 # e.g. seq is None 245 self._per_letter_annotations = _RestrictedDict(length=0) 246 self._per_letter_annotations.update(value)
247 letter_annotations = property( 248 fget=lambda self: self._per_letter_annotations, 249 fset=_set_per_letter_annotations, 250 doc="""Dictionary of per-letter-annotation for the sequence. 251 252 For example, this can hold quality scores used in FASTQ or QUAL files. 253 Consider this example using Bio.SeqIO to read in an example Solexa 254 variant FASTQ file as a SeqRecord: 255 256 >>> from Bio import SeqIO 257 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 258 >>> print("%s %s" % (record.id, record.seq)) 259 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 260 >>> print(list(record.letter_annotations)) 261 ['solexa_quality'] 262 >>> print(record.letter_annotations["solexa_quality"]) 263 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 264 265 The letter_annotations get sliced automatically if you slice the 266 parent SeqRecord, for example taking the last ten bases: 267 268 >>> sub_record = record[-10:] 269 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 270 slxa_0001_1_0001_01 ACGTNNNNNN 271 >>> print(sub_record.letter_annotations["solexa_quality"]) 272 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 273 274 Any python sequence (i.e. list, tuple or string) can be recorded in 275 the SeqRecord's letter_annotations dictionary as long as the length 276 matches that of the SeqRecord's sequence. e.g. 277 278 >>> len(sub_record.letter_annotations) 279 1 280 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 281 >>> len(sub_record.letter_annotations) 282 2 283 284 You can delete entries from the letter_annotations dictionary as usual: 285 286 >>> del sub_record.letter_annotations["solexa_quality"] 287 >>> sub_record.letter_annotations 288 {'dummy': 'abcdefghij'} 289 290 You can completely clear the dictionary easily as follows: 291 292 >>> sub_record.letter_annotations = {} 293 >>> sub_record.letter_annotations 294 {} 295 """) 296
297 - def _set_seq(self, value):
298 # TODO - Add a deprecation warning that the seq should be write only? 299 if self._per_letter_annotations: 300 # TODO - Make this a warning? Silently empty the dictionary? 301 raise ValueError("You must empty the letter annotations first!") 302 self._seq = value 303 try: 304 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 305 except AttributeError: 306 # e.g. seq is None 307 self._per_letter_annotations = _RestrictedDict(length=0)
308 309 seq = property(fget=lambda self: self._seq, 310 fset=_set_seq, 311 doc="The sequence itself, as a Seq or MutableSeq object.") 312
313 - def __getitem__(self, index):
314 """Returns a sub-sequence or an individual letter. 315 316 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 317 that sub-sequence with appropriate annotation preserved. The 318 name, id and description are kept. 319 320 Any per-letter-annotations are sliced to match the requested 321 sub-sequence. Unless a stride is used, all those features 322 which fall fully within the subsequence are included (with 323 their locations adjusted accordingly). 324 325 However, the annotations dictionary and the dbxrefs list are 326 not used for the new SeqRecord, as in general they may not 327 apply to the subsequence. If you want to preserve them, you 328 must explicitly copy them to the new SeqRecord yourself. 329 330 Using an integer index, e.g. my_record[5] is shorthand for 331 extracting that letter from the sequence, my_record.seq[5]. 332 333 For example, consider this short protein and its secondary 334 structure as encoded by the PDB (e.g. H for alpha helices), 335 plus a simple feature for its histidine self phosphorylation 336 site: 337 338 >>> from Bio.Seq import Seq 339 >>> from Bio.SeqRecord import SeqRecord 340 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 341 >>> from Bio.Alphabet import IUPAC 342 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 343 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 344 ... IUPAC.protein), 345 ... id="1JOY", name="EnvZ", 346 ... description="Homodimeric domain of EnvZ from E. coli") 347 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 348 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 349 ... type = "Site")) 350 351 Now let's have a quick look at the full record, 352 353 >>> print(rec) 354 ID: 1JOY 355 Name: EnvZ 356 Description: Homodimeric domain of EnvZ from E. coli 357 Number of features: 1 358 Per letter annotation for: secondary_structure 359 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 360 >>> print(rec.letter_annotations["secondary_structure"]) 361 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 362 >>> print(rec.features[0].location) 363 [20:21] 364 365 Now let's take a sub sequence, here chosen as the first (fractured) 366 alpha helix which includes the histidine phosphorylation site: 367 368 >>> sub = rec[11:41] 369 >>> print(sub) 370 ID: 1JOY 371 Name: EnvZ 372 Description: Homodimeric domain of EnvZ from E. coli 373 Number of features: 1 374 Per letter annotation for: secondary_structure 375 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 376 >>> print(sub.letter_annotations["secondary_structure"]) 377 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 378 >>> print(sub.features[0].location) 379 [9:10] 380 381 You can also of course omit the start or end values, for 382 example to get the first ten letters only: 383 384 >>> print(rec[:10]) 385 ID: 1JOY 386 Name: EnvZ 387 Description: Homodimeric domain of EnvZ from E. coli 388 Number of features: 0 389 Per letter annotation for: secondary_structure 390 Seq('MAAGVKQLAD', IUPACProtein()) 391 392 Or for the last ten letters: 393 394 >>> print(rec[-10:]) 395 ID: 1JOY 396 Name: EnvZ 397 Description: Homodimeric domain of EnvZ from E. coli 398 Number of features: 0 399 Per letter annotation for: secondary_structure 400 Seq('IIEQFIDYLR', IUPACProtein()) 401 402 If you omit both, then you get a copy of the original record (although 403 lacking the annotations and dbxrefs): 404 405 >>> print(rec[:]) 406 ID: 1JOY 407 Name: EnvZ 408 Description: Homodimeric domain of EnvZ from E. coli 409 Number of features: 1 410 Per letter annotation for: secondary_structure 411 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 412 413 Finally, indexing with a simple integer is shorthand for pulling out 414 that letter from the sequence directly: 415 416 >>> rec[5] 417 'K' 418 >>> rec.seq[5] 419 'K' 420 """ 421 if isinstance(index, int): 422 # NOTE - The sequence level annotation like the id, name, etc 423 # do not really apply to a single character. However, should 424 # we try and expose any per-letter-annotation here? If so how? 425 return self.seq[index] 426 elif isinstance(index, slice): 427 if self.seq is None: 428 raise ValueError("If the sequence is None, we cannot slice it.") 429 parent_length = len(self) 430 answer = self.__class__(self.seq[index], 431 id=self.id, 432 name=self.name, 433 description=self.description) 434 # TODO - The description may no longer apply. 435 # It would be safer to change it to something 436 # generic like "edited" or the default value. 437 438 # Don't copy the annotation dict and dbxefs list, 439 # they may not apply to a subsequence. 440 # answer.annotations = dict(self.annotations.items()) 441 # answer.dbxrefs = self.dbxrefs[:] 442 # TODO - Review this in light of adding SeqRecord objects? 443 444 # TODO - Cope with strides by generating ambiguous locations? 445 start, stop, step = index.indices(parent_length) 446 if step == 1: 447 # Select relevant features, add them with shifted locations 448 # assert str(self.seq)[index] == str(self.seq)[start:stop] 449 for f in self.features: 450 if f.ref or f.ref_db: 451 # TODO - Implement this (with lots of tests)? 452 import warnings 453 warnings.warn("When slicing SeqRecord objects, any " 454 "SeqFeature referencing other sequences (e.g. " 455 "from segmented GenBank records) are ignored.") 456 continue 457 if start <= f.location.nofuzzy_start \ 458 and f.location.nofuzzy_end <= stop: 459 answer.features.append(f._shift(-start)) 460 461 # Slice all the values to match the sliced sequence 462 # (this should also work with strides, even negative strides): 463 for key, value in self.letter_annotations.items(): 464 answer._per_letter_annotations[key] = value[index] 465 466 return answer 467 raise ValueError("Invalid index")
468
469 - def __iter__(self):
470 """Iterate over the letters in the sequence. 471 472 For example, using Bio.SeqIO to read in a protein FASTA file: 473 474 >>> from Bio import SeqIO 475 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 476 >>> for amino in record: 477 ... print(amino) 478 ... if amino == "L": break 479 X 480 A 481 G 482 L 483 >>> print(record.seq[3]) 484 L 485 486 This is just a shortcut for iterating over the sequence directly: 487 488 >>> for amino in record.seq: 489 ... print(amino) 490 ... if amino == "L": break 491 X 492 A 493 G 494 L 495 >>> print(record.seq[3]) 496 L 497 498 Note that this does not facilitate iteration together with any 499 per-letter-annotation. However, you can achieve that using the 500 python zip function on the record (or its sequence) and the relevant 501 per-letter-annotation: 502 503 >>> from Bio import SeqIO 504 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 505 >>> print("%s %s" % (rec.id, rec.seq)) 506 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 507 >>> print(list(rec.letter_annotations)) 508 ['solexa_quality'] 509 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 510 ... if qual > 35: 511 ... print("%s %i" % (nuc, qual)) 512 A 40 513 C 39 514 G 38 515 T 37 516 A 36 517 518 You may agree that using zip(rec.seq, ...) is more explicit than using 519 zip(rec, ...) as shown above. 520 """ 521 return iter(self.seq)
522
523 - def __contains__(self, char):
524 """Implements the 'in' keyword, searches the sequence. 525 526 e.g. 527 528 >>> from Bio import SeqIO 529 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 530 >>> "GAATTC" in record 531 False 532 >>> "AAA" in record 533 True 534 535 This essentially acts as a proxy for using "in" on the sequence: 536 537 >>> "GAATTC" in record.seq 538 False 539 >>> "AAA" in record.seq 540 True 541 542 Note that you can also use Seq objects as the query, 543 544 >>> from Bio.Seq import Seq 545 >>> from Bio.Alphabet import generic_dna 546 >>> Seq("AAA") in record 547 True 548 >>> Seq("AAA", generic_dna) in record 549 True 550 551 See also the Seq object's __contains__ method. 552 """ 553 return char in self.seq
554
555 - def __str__(self):
556 """A human readable summary of the record and its annotation (string). 557 558 The python built in function str works by calling the object's ___str__ 559 method. e.g. 560 561 >>> from Bio.Seq import Seq 562 >>> from Bio.SeqRecord import SeqRecord 563 >>> from Bio.Alphabet import IUPAC 564 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 565 ... IUPAC.protein), 566 ... id="YP_025292.1", name="HokC", 567 ... description="toxic membrane protein, small") 568 >>> print(str(record)) 569 ID: YP_025292.1 570 Name: HokC 571 Description: toxic membrane protein, small 572 Number of features: 0 573 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 574 575 In this example you don't actually need to call str explicity, as the 576 print command does this automatically: 577 578 >>> print(record) 579 ID: YP_025292.1 580 Name: HokC 581 Description: toxic membrane protein, small 582 Number of features: 0 583 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 584 585 Note that long sequences are shown truncated. 586 """ 587 lines = [] 588 if self.id: 589 lines.append("ID: {0}".format(self.id)) 590 if self.name: 591 lines.append("Name: {0}".format(self.name)) 592 if self.description: 593 lines.append("Description: {0}".format(self.description)) 594 if self.dbxrefs: 595 lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) 596 lines.append("Number of features: {0}".format(len(self.features))) 597 for a in self.annotations: 598 lines.append("/{0}={1}".format(a, str(self.annotations[a]))) 599 if self.letter_annotations: 600 lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) 601 # Don't want to include the entire sequence, 602 # and showing the alphabet is useful: 603 lines.append(repr(self.seq)) 604 return "\n".join(lines)
605
606 - def __repr__(self):
607 """A concise summary of the record for debugging (string). 608 609 The python built in function repr works by calling the object's ___repr__ 610 method. e.g. 611 612 >>> from Bio.Seq import Seq 613 >>> from Bio.SeqRecord import SeqRecord 614 >>> from Bio.Alphabet import generic_protein 615 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 616 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 617 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 618 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 619 ... generic_protein), 620 ... id="NP_418483.1", name="b4059", 621 ... description="ssDNA-binding protein", 622 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 623 >>> print(repr(rec)) 624 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 625 626 At the python prompt you can also use this shorthand: 627 628 >>> rec 629 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 630 631 Note that long sequences are shown truncated. Also note that any 632 annotations, letter_annotations and features are not shown (as they 633 would lead to a very long string). 634 """ 635 return "{0}(seq={1!r}, id={2!r}, name={3!r}, description={4!r}, dbxrefs={5!r})".format( 636 self.__class__.__name__, 637 self.seq, self.id, self.name, 638 self.description, self.dbxrefs)
639
640 - def format(self, format):
641 r"""Returns the record as a string in the specified file format. 642 643 The format should be a lower case string supported as an output 644 format by Bio.SeqIO, which is used to turn the SeqRecord into a 645 string. e.g. 646 647 >>> from Bio.Seq import Seq 648 >>> from Bio.SeqRecord import SeqRecord 649 >>> from Bio.Alphabet import IUPAC 650 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 651 ... IUPAC.protein), 652 ... id="YP_025292.1", name="HokC", 653 ... description="toxic membrane protein") 654 >>> record.format("fasta") 655 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 656 >>> print(record.format("fasta")) 657 >YP_025292.1 toxic membrane protein 658 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 659 <BLANKLINE> 660 661 The python print command automatically appends a new line, meaning 662 in this example a blank line is shown. If you look at the string 663 representation you can see there is a trailing new line (shown as 664 slash n) which is important when writing to a file or if 665 concatenating multiple sequence strings together. 666 667 Note that this method will NOT work on every possible file format 668 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 669 """ 670 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 671 # See also the Bio.Align.Generic.Alignment class and its format() 672 return self.__format__(format)
673
674 - def __format__(self, format_spec):
675 """Returns the record as a string in the specified file format. 676 677 This method supports the python format() function added in 678 Python 2.6/3.0. The format_spec should be a lower case string 679 supported by Bio.SeqIO as an output file format. See also the 680 SeqRecord's format() method. 681 682 Under Python 3 please note that for binary formats a bytes 683 string is returned, otherwise a (unicode) string is returned. 684 """ 685 if not format_spec: 686 # Follow python convention and default to using __str__ 687 return str(self) 688 from Bio import SeqIO 689 if format_spec in SeqIO._BinaryFormats: 690 # Return bytes on Python 3 691 from io import BytesIO 692 handle = BytesIO() 693 else: 694 from Bio._py3k import StringIO 695 handle = StringIO() 696 SeqIO.write(self, handle, format_spec) 697 return handle.getvalue()
698
699 - def __len__(self):
700 """Returns the length of the sequence. 701 702 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 703 704 >>> from Bio import SeqIO 705 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 706 >>> len(record) 707 309 708 >>> len(record.seq) 709 309 710 """ 711 return len(self.seq)
712 713 # Python 3:
714 - def __bool__(self):
715 """Boolean value of an instance of this class (True). 716 717 This behaviour is for backwards compatibility, since until the 718 __len__ method was added, a SeqRecord always evaluated as True. 719 720 Note that in comparison, a Seq object will evaluate to False if it 721 has a zero length sequence. 722 723 WARNING: The SeqRecord may in future evaluate to False when its 724 sequence is of zero length (in order to better match the Seq 725 object behaviour)! 726 """ 727 return True
728 729 # Python 2: 730 __nonzero__ = __bool__ 731
732 - def __add__(self, other):
733 """Add another sequence or string to this sequence. 734 735 The other sequence can be a SeqRecord object, a Seq object (or 736 similar, e.g. a MutableSeq) or a plain Python string. If you add 737 a plain string or a Seq (like) object, the new SeqRecord will simply 738 have this appended to the existing data. However, any per letter 739 annotation will be lost: 740 741 >>> from Bio import SeqIO 742 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 743 >>> print("%s %s" % (record.id, record.seq)) 744 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 745 >>> print(list(record.letter_annotations)) 746 ['solexa_quality'] 747 748 >>> new = record + "ACT" 749 >>> print("%s %s" % (new.id, new.seq)) 750 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 751 >>> print(list(new.letter_annotations)) 752 [] 753 754 The new record will attempt to combine the annotation, but for any 755 ambiguities (e.g. different names) it defaults to omitting that 756 annotation. 757 758 >>> from Bio import SeqIO 759 >>> with open("GenBank/pBAD30.gb") as handle: 760 ... plasmid = SeqIO.read(handle, "gb") 761 >>> print("%s %i" % (plasmid.id, len(plasmid))) 762 pBAD30 4923 763 764 Now let's cut the plasmid into two pieces, and join them back up the 765 other way round (i.e. shift the starting point on this plasmid, have 766 a look at the annotated features in the original file to see why this 767 particular split point might make sense): 768 769 >>> left = plasmid[:3765] 770 >>> right = plasmid[3765:] 771 >>> new = right + left 772 >>> print("%s %i" % (new.id, len(new))) 773 pBAD30 4923 774 >>> str(new.seq) == str(right.seq + left.seq) 775 True 776 >>> len(new.features) == len(left.features) + len(right.features) 777 True 778 779 When we add the left and right SeqRecord objects, their annotation 780 is all consistent, so it is all conserved in the new SeqRecord: 781 782 >>> new.id == left.id == right.id == plasmid.id 783 True 784 >>> new.name == left.name == right.name == plasmid.name 785 True 786 >>> new.description == plasmid.description 787 True 788 >>> new.annotations == left.annotations == right.annotations 789 True 790 >>> new.letter_annotations == plasmid.letter_annotations 791 True 792 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 793 True 794 795 However, we should point out that when we sliced the SeqRecord, 796 any annotations dictionary or dbxrefs list entries were lost. 797 You can explicitly copy them like this: 798 799 >>> new.annotations = plasmid.annotations.copy() 800 >>> new.dbxrefs = plasmid.dbxrefs[:] 801 """ 802 if not isinstance(other, SeqRecord): 803 # Assume it is a string or a Seq. 804 # Note can't transfer any per-letter-annotations 805 return SeqRecord(self.seq + other, 806 id=self.id, name=self.name, 807 description=self.description, 808 features=self.features[:], 809 annotations=self.annotations.copy(), 810 dbxrefs=self.dbxrefs[:]) 811 # Adding two SeqRecord objects... must merge annotation. 812 answer = SeqRecord(self.seq + other.seq, 813 features=self.features[:], 814 dbxrefs=self.dbxrefs[:]) 815 # Will take all the features and all the db cross refs, 816 l = len(self) 817 for f in other.features: 818 answer.features.append(f._shift(l)) 819 del l 820 for ref in other.dbxrefs: 821 if ref not in answer.dbxrefs: 822 answer.dbxrefs.append(ref) 823 # Take common id/name/description/annotation 824 if self.id == other.id: 825 answer.id = self.id 826 if self.name == other.name: 827 answer.name = self.name 828 if self.description == other.description: 829 answer.description = self.description 830 for k, v in self.annotations.items(): 831 if k in other.annotations and other.annotations[k] == v: 832 answer.annotations[k] = v 833 # Can append matching per-letter-annotation 834 for k, v in self.letter_annotations.items(): 835 if k in other.letter_annotations: 836 answer.letter_annotations[k] = v + other.letter_annotations[k] 837 return answer
838
839 - def __radd__(self, other):
840 """Add another sequence or string to this sequence (from the left). 841 842 This method handles adding a Seq object (or similar, e.g. MutableSeq) 843 or a plain Python string (on the left) to a SeqRecord (on the right). 844 See the __add__ method for more details, but for example: 845 846 >>> from Bio import SeqIO 847 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 848 >>> print("%s %s" % (record.id, record.seq)) 849 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 850 >>> print(list(record.letter_annotations)) 851 ['solexa_quality'] 852 853 >>> new = "ACT" + record 854 >>> print("%s %s" % (new.id, new.seq)) 855 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 856 >>> print(list(new.letter_annotations)) 857 [] 858 """ 859 if isinstance(other, SeqRecord): 860 raise RuntimeError("This should have happened via the __add__ of " 861 "the other SeqRecord being added!") 862 # Assume it is a string or a Seq. 863 # Note can't transfer any per-letter-annotations 864 offset = len(other) 865 return SeqRecord(other + self.seq, 866 id=self.id, name=self.name, 867 description=self.description, 868 features=[f._shift(offset) for f in self.features], 869 annotations=self.annotations.copy(), 870 dbxrefs=self.dbxrefs[:])
871
872 - def upper(self):
873 """Returns a copy of the record with an upper case sequence. 874 875 All the annotation is preserved unchanged. e.g. 876 877 >>> from Bio.Alphabet import generic_dna 878 >>> from Bio.Seq import Seq 879 >>> from Bio.SeqRecord import SeqRecord 880 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 881 ... description = "Made up for this example") 882 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 883 >>> print(record.upper().format("fastq")) 884 @Test Made up for this example 885 ACGTACGT 886 + 887 "#$%&'() 888 <BLANKLINE> 889 890 Naturally, there is a matching lower method: 891 892 >>> print(record.lower().format("fastq")) 893 @Test Made up for this example 894 acgtacgt 895 + 896 "#$%&'() 897 <BLANKLINE> 898 """ 899 return SeqRecord(self.seq.upper(), 900 id=self.id, name=self.name, 901 description=self.description, 902 dbxrefs=self.dbxrefs[:], 903 features=self.features[:], 904 annotations=self.annotations.copy(), 905 letter_annotations=self.letter_annotations.copy())
906
907 - def lower(self):
908 """Returns a copy of the record with a lower case sequence. 909 910 All the annotation is preserved unchanged. e.g. 911 912 >>> from Bio import SeqIO 913 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 914 >>> print(record.format("fasta")) 915 >gi|3298468|dbj|BAA31520.1| SAMIPF 916 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 917 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 918 <BLANKLINE> 919 >>> print(record.lower().format("fasta")) 920 >gi|3298468|dbj|BAA31520.1| SAMIPF 921 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 922 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 923 <BLANKLINE> 924 925 To take a more annotation rich example, 926 927 >>> from Bio import SeqIO 928 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 929 >>> len(old.features) 930 3 931 >>> new = old.lower() 932 >>> len(old.features) == len(new.features) 933 True 934 >>> old.annotations["organism"] == new.annotations["organism"] 935 True 936 >>> old.dbxrefs == new.dbxrefs 937 True 938 """ 939 return SeqRecord(self.seq.lower(), 940 id=self.id, name=self.name, 941 description=self.description, 942 dbxrefs=self.dbxrefs[:], 943 features=self.features[:], 944 annotations=self.annotations.copy(), 945 letter_annotations=self.letter_annotations.copy())
946
947 - def reverse_complement(self, id=False, name=False, description=False, 948 features=True, annotations=False, 949 letter_annotations=True, dbxrefs=False):
950 """Returns new SeqRecord with reverse complement sequence. 951 952 You can specify the returned record's id, name and description as 953 strings, or True to keep that of the parent, or False for a default. 954 955 You can specify the returned record's features with a list of 956 SeqFeature objects, or True to keep that of the parent, or False to 957 omit them. The default is to keep the original features (with the 958 strand and locations adjusted). 959 960 You can also specify both the returned record's annotations and 961 letter_annotations as dictionaries, True to keep that of the parent, 962 or False to omit them. The default is to keep the original 963 annotations (with the letter annotations reversed). 964 965 To show what happens to the pre-letter annotations, consider an 966 example Solexa variant FASTQ file with a single entry, which we'll 967 read in as a SeqRecord: 968 969 >>> from Bio import SeqIO 970 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 971 >>> print("%s %s" % (record.id, record.seq)) 972 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 973 >>> print(list(record.letter_annotations)) 974 ['solexa_quality'] 975 >>> print(record.letter_annotations["solexa_quality"]) 976 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 977 978 Now take the reverse complement, 979 980 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 981 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 982 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 983 984 Notice that the per-letter-annotations have also been reversed, 985 although this may not be appropriate for all cases. 986 987 >>> print(rc_record.letter_annotations["solexa_quality"]) 988 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 989 990 Now for the features, we need a different example. Parsing a GenBank 991 file is probably the easiest way to get an nice example with features 992 in it... 993 994 >>> from Bio import SeqIO 995 >>> with open("GenBank/pBAD30.gb") as handle: 996 ... plasmid = SeqIO.read(handle, "gb") 997 >>> print("%s %i" % (plasmid.id, len(plasmid))) 998 pBAD30 4923 999 >>> plasmid.seq 1000 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1001 >>> len(plasmid.features) 1002 13 1003 1004 Now, let's take the reverse complement of this whole plasmid: 1005 1006 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1007 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1008 pBAD30_rc 4923 1009 >>> rc_plasmid.seq 1010 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1011 >>> len(rc_plasmid.features) 1012 13 1013 1014 Let's compare the first CDS feature - it has gone from being the 1015 second feature (index 1) to the second last feature (index -2), its 1016 strand has changed, and the location switched round. 1017 1018 >>> print(plasmid.features[1]) 1019 type: CDS 1020 location: [1081:1960](-) 1021 qualifiers: 1022 Key: label, Value: ['araC'] 1023 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1024 Key: vntifkey, Value: ['4'] 1025 <BLANKLINE> 1026 >>> print(rc_plasmid.features[-2]) 1027 type: CDS 1028 location: [2963:3842](+) 1029 qualifiers: 1030 Key: label, Value: ['araC'] 1031 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1032 Key: vntifkey, Value: ['4'] 1033 <BLANKLINE> 1034 1035 You can check this new location, based on the length of the plasmid: 1036 1037 >>> len(plasmid) - 1081 1038 3842 1039 >>> len(plasmid) - 1960 1040 2963 1041 1042 Note that if the SeqFeature annotation includes any strand specific 1043 information (e.g. base changes for a SNP), this information is not 1044 amended, and would need correction after the reverse complement. 1045 1046 Note trying to reverse complement a protein SeqRecord raises an 1047 exception: 1048 1049 >>> from Bio.SeqRecord import SeqRecord 1050 >>> from Bio.Seq import Seq 1051 >>> from Bio.Alphabet import IUPAC 1052 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1053 >>> protein_rec.reverse_complement() 1054 Traceback (most recent call last): 1055 ... 1056 ValueError: Proteins do not have complements! 1057 1058 Also note you can reverse complement a SeqRecord using a MutableSeq: 1059 1060 >>> from Bio.SeqRecord import SeqRecord 1061 >>> from Bio.Seq import MutableSeq 1062 >>> from Bio.Alphabet import generic_dna 1063 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1064 >>> rec.seq[0] = "T" 1065 >>> print("%s %s" % (rec.id, rec.seq)) 1066 Test TCGT 1067 >>> rc = rec.reverse_complement(id=True) 1068 >>> print("%s %s" % (rc.id, rc.seq)) 1069 Test ACGA 1070 """ 1071 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1072 if isinstance(self.seq, MutableSeq): 1073 # Currently the MutableSeq reverse complement is in situ 1074 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1075 else: 1076 answer = SeqRecord(self.seq.reverse_complement()) 1077 if isinstance(id, basestring): 1078 answer.id = id 1079 elif id: 1080 answer.id = self.id 1081 if isinstance(name, basestring): 1082 answer.name = name 1083 elif name: 1084 answer.name = self.name 1085 if isinstance(description, basestring): 1086 answer.description = description 1087 elif description: 1088 answer.description = self.description 1089 if isinstance(dbxrefs, list): 1090 answer.dbxrefs = dbxrefs 1091 elif dbxrefs: 1092 # Copy the old dbxrefs 1093 answer.dbxrefs = self.dbxrefs[:] 1094 if isinstance(features, list): 1095 answer.features = features 1096 elif features: 1097 # Copy the old features, adjusting location and string 1098 l = len(answer) 1099 answer.features = [f._flip(l) for f in self.features] 1100 # The old list should have been sorted by start location, 1101 # reversing it will leave it sorted by what is now the end position, 1102 # so we need to resort in case of overlapping features. 1103 # NOTE - In the common case of gene before CDS (and similar) with 1104 # the exact same locations, this will still maintain gene before CDS 1105 answer.features.sort(key=lambda x: x.location.start.position) 1106 if isinstance(annotations, dict): 1107 answer.annotations = annotations 1108 elif annotations: 1109 # Copy the old annotations, 1110 answer.annotations = self.annotations.copy() 1111 if isinstance(letter_annotations, dict): 1112 answer.letter_annotations = letter_annotations 1113 elif letter_annotations: 1114 # Copy the old per letter annotations, reversing them 1115 for key, value in self.letter_annotations.items(): 1116 answer._per_letter_annotations[key] = value[::-1] 1117 return answer
1118 1119 1120 if __name__ == "__main__": 1121 from Bio._utils import run_doctest 1122 run_doctest() 1123