Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9   
  10   
  11  from Bio._py3k import basestring 
  12   
  13  __docformat__ = "restructuredtext en"  # Simple markup to show doctests nicely 
  14   
  15  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  16  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  17  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  18  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  19   
  20   
21 -class _RestrictedDict(dict):
22 """Dict which only allows sequences of given length as values (PRIVATE). 23 24 This simple subclass of the Python dictionary is used in the SeqRecord 25 object for holding per-letter-annotations. This class is intended to 26 prevent simple errors by only allowing python sequences (e.g. lists, 27 strings and tuples) to be stored, and only if their length matches that 28 expected (the length of the SeqRecord's seq object). It cannot however 29 prevent the entries being edited in situ (for example appending entries 30 to a list). 31 32 >>> x = _RestrictedDict(5) 33 >>> x["test"] = "hello" 34 >>> x 35 {'test': 'hello'} 36 37 Adding entries which don't have the expected length are blocked: 38 39 >>> x["test"] = "hello world" 40 Traceback (most recent call last): 41 ... 42 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 43 44 The expected length is stored as a private attribute, 45 46 >>> x._length 47 5 48 49 In order that the SeqRecord (and other objects using this class) can be 50 pickled, for example for use in the multiprocessing library, we need to 51 be able to pickle the restricted dictionary objects. 52 53 Using the default protocol, which is 0 on Python 2.x, 54 55 >>> import pickle 56 >>> y = pickle.loads(pickle.dumps(x)) 57 >>> y 58 {'test': 'hello'} 59 >>> y._length 60 5 61 62 Using the highest protocol, which is 2 on Python 2.x, 63 64 >>> import pickle 65 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 66 >>> z 67 {'test': 'hello'} 68 >>> z._length 69 5 70 """ 71
72 - def __init__(self, length):
73 """Create an EMPTY restricted dictionary.""" 74 dict.__init__(self) 75 self._length = int(length)
76
77 - def __setitem__(self, key, value):
78 # The check hasattr(self, "_length") is to cope with pickle protocol 2 79 # I couldn't seem to avoid this with __getstate__ and __setstate__ 80 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 81 or (hasattr(self, "_length") and len(value) != self._length): 82 raise TypeError("We only allow python sequences (lists, tuples or " 83 "strings) of length %i." % self._length) 84 dict.__setitem__(self, key, value)
85
86 - def update(self, new_dict):
87 # Force this to go via our strict __setitem__ method 88 for (key, value) in new_dict.items(): 89 self[key] = value
90 91
92 -class SeqRecord(object):
93 """A SeqRecord object holds a sequence and information about it. 94 95 Main attributes: 96 - id - Identifier such as a locus tag (string) 97 - seq - The sequence itself (Seq object or similar) 98 99 Additional attributes: 100 - name - Sequence name, e.g. gene name (string) 101 - description - Additional text (string) 102 - dbxrefs - List of database cross references (list of strings) 103 - features - Any (sub)features defined (list of SeqFeature objects) 104 - annotations - Further information about the whole sequence (dictionary). 105 Most entries are strings, or lists of strings. 106 - letter_annotations - Per letter/symbol annotation (restricted 107 dictionary). This holds Python sequences (lists, strings 108 or tuples) whose length matches that of the sequence. 109 A typical use would be to hold a list of integers 110 representing sequencing quality scores, or a string 111 representing the secondary structure. 112 113 You will typically use Bio.SeqIO to read in sequences from files as 114 SeqRecord objects. However, you may want to create your own SeqRecord 115 objects directly (see the __init__ method for further details): 116 117 >>> from Bio.Seq import Seq 118 >>> from Bio.SeqRecord import SeqRecord 119 >>> from Bio.Alphabet import IUPAC 120 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 121 ... IUPAC.protein), 122 ... id="YP_025292.1", name="HokC", 123 ... description="toxic membrane protein") 124 >>> print(record) 125 ID: YP_025292.1 126 Name: HokC 127 Description: toxic membrane protein 128 Number of features: 0 129 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 130 131 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 132 for this. For the special case where you want the SeqRecord turned into 133 a string in a particular file format there is a format method which uses 134 Bio.SeqIO internally: 135 136 >>> print(record.format("fasta")) 137 >YP_025292.1 toxic membrane protein 138 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 139 <BLANKLINE> 140 141 You can also do things like slicing a SeqRecord, checking its length, etc 142 143 >>> len(record) 144 44 145 >>> edited = record[:10] + record[11:] 146 >>> print(edited.seq) 147 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 148 >>> print(record.seq) 149 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 150 151 """ 152
153 - def __init__(self, seq, id="<unknown id>", name="<unknown name>", 154 description="<unknown description>", dbxrefs=None, 155 features=None, annotations=None, 156 letter_annotations=None):
157 """Create a SeqRecord. 158 159 Arguments: 160 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 161 - id - Sequence identifier, recommended (string) 162 - name - Sequence name, optional (string) 163 - description - Sequence description, optional (string) 164 - dbxrefs - Database cross references, optional (list of strings) 165 - features - Any (sub)features, optional (list of SeqFeature objects) 166 - annotations - Dictionary of annotations for the whole sequence 167 - letter_annotations - Dictionary of per-letter-annotations, values 168 should be strings, list or tuples of the same 169 length as the full sequence. 170 171 You will typically use Bio.SeqIO to read in sequences from files as 172 SeqRecord objects. However, you may want to create your own SeqRecord 173 objects directly. 174 175 Note that while an id is optional, we strongly recommend you supply a 176 unique id string for each record. This is especially important 177 if you wish to write your sequences to a file. 178 179 If you don't have the actual sequence, but you do know its length, 180 then using the UnknownSeq object from Bio.Seq is appropriate. 181 182 You can create a 'blank' SeqRecord object, and then populate the 183 attributes later. 184 """ 185 if id is not None and not isinstance(id, basestring): 186 # Lots of existing code uses id=None... this may be a bad idea. 187 raise TypeError("id argument should be a string") 188 if not isinstance(name, basestring): 189 raise TypeError("name argument should be a string") 190 if not isinstance(description, basestring): 191 raise TypeError("description argument should be a string") 192 self._seq = seq 193 self.id = id 194 self.name = name 195 self.description = description 196 197 # database cross references (for the whole sequence) 198 if dbxrefs is None: 199 dbxrefs = [] 200 elif not isinstance(dbxrefs, list): 201 raise TypeError("dbxrefs argument should be a list (of strings)") 202 self.dbxrefs = dbxrefs 203 204 # annotations about the whole sequence 205 if annotations is None: 206 annotations = {} 207 elif not isinstance(annotations, dict): 208 raise TypeError("annotations argument should be a dict") 209 self.annotations = annotations 210 211 if letter_annotations is None: 212 # annotations about each letter in the sequence 213 if seq is None: 214 # Should we allow this and use a normal unrestricted dict? 215 self._per_letter_annotations = _RestrictedDict(length=0) 216 else: 217 try: 218 self._per_letter_annotations = \ 219 _RestrictedDict(length=len(seq)) 220 except: 221 raise TypeError("seq argument should be a Seq object or similar") 222 else: 223 # This will be handled via the property set function, which will 224 # turn this into a _RestrictedDict and thus ensure all the values 225 # in the dict are the right length 226 self.letter_annotations = letter_annotations 227 228 # annotations about parts of the sequence 229 if features is None: 230 features = [] 231 elif not isinstance(features, list): 232 raise TypeError("features argument should be a list (of SeqFeature objects)") 233 self.features = features
234 235 # TODO - Just make this a read only property?
236 - def _set_per_letter_annotations(self, value):
237 if not isinstance(value, dict): 238 raise TypeError("The per-letter-annotations should be a " 239 "(restricted) dictionary.") 240 # Turn this into a restricted-dictionary (and check the entries) 241 try: 242 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 243 except AttributeError: 244 # e.g. seq is None 245 self._per_letter_annotations = _RestrictedDict(length=0) 246 self._per_letter_annotations.update(value)
247 letter_annotations = property( 248 fget=lambda self: self._per_letter_annotations, 249 fset=_set_per_letter_annotations, 250 doc="""Dictionary of per-letter-annotation for the sequence. 251 252 For example, this can hold quality scores used in FASTQ or QUAL files. 253 Consider this example using Bio.SeqIO to read in an example Solexa 254 variant FASTQ file as a SeqRecord: 255 256 >>> from Bio import SeqIO 257 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 258 >>> print("%s %s" % (record.id, record.seq)) 259 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 260 >>> print(list(record.letter_annotations)) 261 ['solexa_quality'] 262 >>> print(record.letter_annotations["solexa_quality"]) 263 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 264 265 The letter_annotations get sliced automatically if you slice the 266 parent SeqRecord, for example taking the last ten bases: 267 268 >>> sub_record = record[-10:] 269 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 270 slxa_0001_1_0001_01 ACGTNNNNNN 271 >>> print(sub_record.letter_annotations["solexa_quality"]) 272 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 273 274 Any python sequence (i.e. list, tuple or string) can be recorded in 275 the SeqRecord's letter_annotations dictionary as long as the length 276 matches that of the SeqRecord's sequence. e.g. 277 278 >>> len(sub_record.letter_annotations) 279 1 280 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 281 >>> len(sub_record.letter_annotations) 282 2 283 284 You can delete entries from the letter_annotations dictionary as usual: 285 286 >>> del sub_record.letter_annotations["solexa_quality"] 287 >>> sub_record.letter_annotations 288 {'dummy': 'abcdefghij'} 289 290 You can completely clear the dictionary easily as follows: 291 292 >>> sub_record.letter_annotations = {} 293 >>> sub_record.letter_annotations 294 {} 295 """) 296
297 - def _set_seq(self, value):
298 # TODO - Add a deprecation warning that the seq should be write only? 299 if self._per_letter_annotations: 300 # TODO - Make this a warning? Silently empty the dictionary? 301 raise ValueError("You must empty the letter annotations first!") 302 self._seq = value 303 try: 304 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 305 except AttributeError: 306 # e.g. seq is None 307 self._per_letter_annotations = _RestrictedDict(length=0)
308 309 seq = property(fget=lambda self: self._seq, 310 fset=_set_seq, 311 doc="The sequence itself, as a Seq or MutableSeq object.") 312
313 - def __getitem__(self, index):
314 """Returns a sub-sequence or an individual letter. 315 316 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 317 that sub-sequence with approriate annotation preserved. The 318 name, id and description are kept. 319 320 Any per-letter-annotations are sliced to match the requested 321 sub-sequence. Unless a stride is used, all those features 322 which fall fully within the subsequence are included (with 323 their locations adjusted accordingly). 324 325 However, the annotations dictionary and the dbxrefs list are 326 not used for the new SeqRecord, as in general they may not 327 apply to the subsequence. If you want to preserve them, you 328 must explictly copy them to the new SeqRecord yourself. 329 330 Using an integer index, e.g. my_record[5] is shorthand for 331 extracting that letter from the sequence, my_record.seq[5]. 332 333 For example, consider this short protein and its secondary 334 structure as encoded by the PDB (e.g. H for alpha helices), 335 plus a simple feature for its histidine self phosphorylation 336 site: 337 338 >>> from Bio.Seq import Seq 339 >>> from Bio.SeqRecord import SeqRecord 340 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 341 >>> from Bio.Alphabet import IUPAC 342 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 343 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 344 ... IUPAC.protein), 345 ... id="1JOY", name="EnvZ", 346 ... description="Homodimeric domain of EnvZ from E. coli") 347 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 348 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 349 ... type = "Site")) 350 351 Now let's have a quick look at the full record, 352 353 >>> print(rec) 354 ID: 1JOY 355 Name: EnvZ 356 Description: Homodimeric domain of EnvZ from E. coli 357 Number of features: 1 358 Per letter annotation for: secondary_structure 359 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 360 >>> print(rec.letter_annotations["secondary_structure"]) 361 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 362 >>> print(rec.features[0].location) 363 [20:21] 364 365 Now let's take a sub sequence, here chosen as the first (fractured) 366 alpha helix which includes the histidine phosphorylation site: 367 368 >>> sub = rec[11:41] 369 >>> print(sub) 370 ID: 1JOY 371 Name: EnvZ 372 Description: Homodimeric domain of EnvZ from E. coli 373 Number of features: 1 374 Per letter annotation for: secondary_structure 375 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 376 >>> print(sub.letter_annotations["secondary_structure"]) 377 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 378 >>> print(sub.features[0].location) 379 [9:10] 380 381 You can also of course omit the start or end values, for 382 example to get the first ten letters only: 383 384 >>> print(rec[:10]) 385 ID: 1JOY 386 Name: EnvZ 387 Description: Homodimeric domain of EnvZ from E. coli 388 Number of features: 0 389 Per letter annotation for: secondary_structure 390 Seq('MAAGVKQLAD', IUPACProtein()) 391 392 Or for the last ten letters: 393 394 >>> print(rec[-10:]) 395 ID: 1JOY 396 Name: EnvZ 397 Description: Homodimeric domain of EnvZ from E. coli 398 Number of features: 0 399 Per letter annotation for: secondary_structure 400 Seq('IIEQFIDYLR', IUPACProtein()) 401 402 If you omit both, then you get a copy of the original record (although 403 lacking the annotations and dbxrefs): 404 405 >>> print(rec[:]) 406 ID: 1JOY 407 Name: EnvZ 408 Description: Homodimeric domain of EnvZ from E. coli 409 Number of features: 1 410 Per letter annotation for: secondary_structure 411 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 412 413 Finally, indexing with a simple integer is shorthand for pulling out 414 that letter from the sequence directly: 415 416 >>> rec[5] 417 'K' 418 >>> rec.seq[5] 419 'K' 420 """ 421 if isinstance(index, int): 422 # NOTE - The sequence level annotation like the id, name, etc 423 # do not really apply to a single character. However, should 424 # we try and expose any per-letter-annotation here? If so how? 425 return self.seq[index] 426 elif isinstance(index, slice): 427 if self.seq is None: 428 raise ValueError("If the sequence is None, we cannot slice it.") 429 parent_length = len(self) 430 answer = self.__class__(self.seq[index], 431 id=self.id, 432 name=self.name, 433 description=self.description) 434 # TODO - The desription may no longer apply. 435 # It would be safer to change it to something 436 # generic like "edited" or the default value. 437 438 # Don't copy the annotation dict and dbxefs list, 439 # they may not apply to a subsequence. 440 # answer.annotations = dict(self.annotations.items()) 441 # answer.dbxrefs = self.dbxrefs[:] 442 # TODO - Review this in light of adding SeqRecord objects? 443 444 # TODO - Cope with strides by generating ambiguous locations? 445 start, stop, step = index.indices(parent_length) 446 if step == 1: 447 # Select relevant features, add them with shifted locations 448 # assert str(self.seq)[index] == str(self.seq)[start:stop] 449 for f in self.features: 450 if f.ref or f.ref_db: 451 # TODO - Implement this (with lots of tests)? 452 import warnings 453 warnings.warn("When slicing SeqRecord objects, any " 454 "SeqFeature referencing other sequences (e.g. " 455 "from segmented GenBank records) are ignored.") 456 continue 457 if start <= f.location.nofuzzy_start \ 458 and f.location.nofuzzy_end <= stop: 459 answer.features.append(f._shift(-start)) 460 461 # Slice all the values to match the sliced sequence 462 # (this should also work with strides, even negative strides): 463 for key, value in self.letter_annotations.items(): 464 answer._per_letter_annotations[key] = value[index] 465 466 return answer 467 raise ValueError("Invalid index")
468
469 - def __iter__(self):
470 """Iterate over the letters in the sequence. 471 472 For example, using Bio.SeqIO to read in a protein FASTA file: 473 474 >>> from Bio import SeqIO 475 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 476 >>> for amino in record: 477 ... print(amino) 478 ... if amino == "L": break 479 X 480 A 481 G 482 L 483 >>> print(record.seq[3]) 484 L 485 486 This is just a shortcut for iterating over the sequence directly: 487 488 >>> for amino in record.seq: 489 ... print(amino) 490 ... if amino == "L": break 491 X 492 A 493 G 494 L 495 >>> print(record.seq[3]) 496 L 497 498 Note that this does not facilitate iteration together with any 499 per-letter-annotation. However, you can achieve that using the 500 python zip function on the record (or its sequence) and the relevant 501 per-letter-annotation: 502 503 >>> from Bio import SeqIO 504 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 505 >>> print("%s %s" % (rec.id, rec.seq)) 506 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 507 >>> print(list(rec.letter_annotations)) 508 ['solexa_quality'] 509 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 510 ... if qual > 35: 511 ... print("%s %i" % (nuc, qual)) 512 A 40 513 C 39 514 G 38 515 T 37 516 A 36 517 518 You may agree that using zip(rec.seq, ...) is more explicit than using 519 zip(rec, ...) as shown above. 520 """ 521 return iter(self.seq)
522
523 - def __contains__(self, char):
524 """Implements the 'in' keyword, searches the sequence. 525 526 e.g. 527 528 >>> from Bio import SeqIO 529 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 530 >>> "GAATTC" in record 531 False 532 >>> "AAA" in record 533 True 534 535 This essentially acts as a proxy for using "in" on the sequence: 536 537 >>> "GAATTC" in record.seq 538 False 539 >>> "AAA" in record.seq 540 True 541 542 Note that you can also use Seq objects as the query, 543 544 >>> from Bio.Seq import Seq 545 >>> from Bio.Alphabet import generic_dna 546 >>> Seq("AAA") in record 547 True 548 >>> Seq("AAA", generic_dna) in record 549 True 550 551 See also the Seq object's __contains__ method. 552 """ 553 return char in self.seq
554
555 - def __str__(self):
556 """A human readable summary of the record and its annotation (string). 557 558 The python built in function str works by calling the object's ___str__ 559 method. e.g. 560 561 >>> from Bio.Seq import Seq 562 >>> from Bio.SeqRecord import SeqRecord 563 >>> from Bio.Alphabet import IUPAC 564 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 565 ... IUPAC.protein), 566 ... id="YP_025292.1", name="HokC", 567 ... description="toxic membrane protein, small") 568 >>> print(str(record)) 569 ID: YP_025292.1 570 Name: HokC 571 Description: toxic membrane protein, small 572 Number of features: 0 573 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 574 575 In this example you don't actually need to call str explicity, as the 576 print command does this automatically: 577 578 >>> print(record) 579 ID: YP_025292.1 580 Name: HokC 581 Description: toxic membrane protein, small 582 Number of features: 0 583 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 584 585 Note that long sequences are shown truncated. 586 """ 587 lines = [] 588 if self.id: 589 lines.append("ID: %s" % self.id) 590 if self.name: 591 lines.append("Name: %s" % self.name) 592 if self.description: 593 lines.append("Description: %s" % self.description) 594 if self.dbxrefs: 595 lines.append("Database cross-references: " 596 + ", ".join(self.dbxrefs)) 597 lines.append("Number of features: %i" % len(self.features)) 598 for a in self.annotations: 599 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 600 if self.letter_annotations: 601 lines.append("Per letter annotation for: " 602 + ", ".join(self.letter_annotations)) 603 # Don't want to include the entire sequence, 604 # and showing the alphabet is useful: 605 lines.append(repr(self.seq)) 606 return "\n".join(lines)
607
608 - def __repr__(self):
609 """A concise summary of the record for debugging (string). 610 611 The python built in function repr works by calling the object's ___repr__ 612 method. e.g. 613 614 >>> from Bio.Seq import Seq 615 >>> from Bio.SeqRecord import SeqRecord 616 >>> from Bio.Alphabet import generic_protein 617 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 618 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 619 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 620 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 621 ... generic_protein), 622 ... id="NP_418483.1", name="b4059", 623 ... description="ssDNA-binding protein", 624 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 625 >>> print(repr(rec)) 626 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 627 628 At the python prompt you can also use this shorthand: 629 630 >>> rec 631 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 632 633 Note that long sequences are shown truncated. Also note that any 634 annotations, letter_annotations and features are not shown (as they 635 would lead to a very long string). 636 """ 637 return self.__class__.__name__ \ 638 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 639 % tuple(map(repr, (self.seq, self.id, self.name, 640 self.description, self.dbxrefs)))
641
642 - def format(self, format):
643 r"""Returns the record as a string in the specified file format. 644 645 The format should be a lower case string supported as an output 646 format by Bio.SeqIO, which is used to turn the SeqRecord into a 647 string. e.g. 648 649 >>> from Bio.Seq import Seq 650 >>> from Bio.SeqRecord import SeqRecord 651 >>> from Bio.Alphabet import IUPAC 652 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 653 ... IUPAC.protein), 654 ... id="YP_025292.1", name="HokC", 655 ... description="toxic membrane protein") 656 >>> record.format("fasta") 657 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 658 >>> print(record.format("fasta")) 659 >YP_025292.1 toxic membrane protein 660 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 661 <BLANKLINE> 662 663 The python print command automatically appends a new line, meaning 664 in this example a blank line is shown. If you look at the string 665 representation you can see there is a trailing new line (shown as 666 slash n) which is important when writing to a file or if 667 concatenating multiple sequence strings together. 668 669 Note that this method will NOT work on every possible file format 670 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 671 """ 672 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 673 # See also the Bio.Align.Generic.Alignment class and its format() 674 return self.__format__(format)
675
676 - def __format__(self, format_spec):
677 """Returns the record as a string in the specified file format. 678 679 This method supports the python format() function added in 680 Python 2.6/3.0. The format_spec should be a lower case string 681 supported by Bio.SeqIO as an output file format. See also the 682 SeqRecord's format() method. 683 684 Under Python 3 please note that for binary formats a bytes 685 string is returned, otherwise a (unicode) string is returned. 686 """ 687 if not format_spec: 688 # Follow python convention and default to using __str__ 689 return str(self) 690 from Bio import SeqIO 691 if format_spec in SeqIO._BinaryFormats: 692 # Return bytes on Python 3 693 from io import BytesIO 694 handle = BytesIO() 695 else: 696 from Bio._py3k import StringIO 697 handle = StringIO() 698 SeqIO.write(self, handle, format_spec) 699 return handle.getvalue()
700
701 - def __len__(self):
702 """Returns the length of the sequence. 703 704 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 705 706 >>> from Bio import SeqIO 707 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 708 >>> len(record) 709 309 710 >>> len(record.seq) 711 309 712 """ 713 return len(self.seq)
714 715 # Python 3:
716 - def __bool__(self):
717 """Boolean value of an instance of this class (True). 718 719 This behaviour is for backwards compatibility, since until the 720 __len__ method was added, a SeqRecord always evaluated as True. 721 722 Note that in comparison, a Seq object will evaluate to False if it 723 has a zero length sequence. 724 725 WARNING: The SeqRecord may in future evaluate to False when its 726 sequence is of zero length (in order to better match the Seq 727 object behaviour)! 728 """ 729 return True
730 731 # Python 2: 732 __nonzero__ = __bool__ 733
734 - def __add__(self, other):
735 """Add another sequence or string to this sequence. 736 737 The other sequence can be a SeqRecord object, a Seq object (or 738 similar, e.g. a MutableSeq) or a plain Python string. If you add 739 a plain string or a Seq (like) object, the new SeqRecord will simply 740 have this appended to the existing data. However, any per letter 741 annotation will be lost: 742 743 >>> from Bio import SeqIO 744 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 745 >>> print("%s %s" % (record.id, record.seq)) 746 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 747 >>> print(list(record.letter_annotations)) 748 ['solexa_quality'] 749 750 >>> new = record + "ACT" 751 >>> print("%s %s" % (new.id, new.seq)) 752 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 753 >>> print(list(new.letter_annotations)) 754 [] 755 756 The new record will attempt to combine the annotation, but for any 757 ambiguities (e.g. different names) it defaults to omitting that 758 annotation. 759 760 >>> from Bio import SeqIO 761 >>> with open("GenBank/pBAD30.gb") as handle: 762 ... plasmid = SeqIO.read(handle, "gb") 763 >>> print("%s %i" % (plasmid.id, len(plasmid))) 764 pBAD30 4923 765 766 Now let's cut the plasmid into two pieces, and join them back up the 767 other way round (i.e. shift the starting point on this plasmid, have 768 a look at the annotated features in the original file to see why this 769 particular split point might make sense): 770 771 >>> left = plasmid[:3765] 772 >>> right = plasmid[3765:] 773 >>> new = right + left 774 >>> print("%s %i" % (new.id, len(new))) 775 pBAD30 4923 776 >>> str(new.seq) == str(right.seq + left.seq) 777 True 778 >>> len(new.features) == len(left.features) + len(right.features) 779 True 780 781 When we add the left and right SeqRecord objects, their annotation 782 is all consistent, so it is all conserved in the new SeqRecord: 783 784 >>> new.id == left.id == right.id == plasmid.id 785 True 786 >>> new.name == left.name == right.name == plasmid.name 787 True 788 >>> new.description == plasmid.description 789 True 790 >>> new.annotations == left.annotations == right.annotations 791 True 792 >>> new.letter_annotations == plasmid.letter_annotations 793 True 794 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 795 True 796 797 However, we should point out that when we sliced the SeqRecord, 798 any annotations dictionary or dbxrefs list entries were lost. 799 You can explicitly copy them like this: 800 801 >>> new.annotations = plasmid.annotations.copy() 802 >>> new.dbxrefs = plasmid.dbxrefs[:] 803 """ 804 if not isinstance(other, SeqRecord): 805 # Assume it is a string or a Seq. 806 # Note can't transfer any per-letter-annotations 807 return SeqRecord(self.seq + other, 808 id=self.id, name=self.name, 809 description=self.description, 810 features=self.features[:], 811 annotations=self.annotations.copy(), 812 dbxrefs=self.dbxrefs[:]) 813 # Adding two SeqRecord objects... must merge annotation. 814 answer = SeqRecord(self.seq + other.seq, 815 features=self.features[:], 816 dbxrefs=self.dbxrefs[:]) 817 # Will take all the features and all the db cross refs, 818 l = len(self) 819 for f in other.features: 820 answer.features.append(f._shift(l)) 821 del l 822 for ref in other.dbxrefs: 823 if ref not in answer.dbxrefs: 824 answer.dbxrefs.append(ref) 825 # Take common id/name/description/annotation 826 if self.id == other.id: 827 answer.id = self.id 828 if self.name == other.name: 829 answer.name = self.name 830 if self.description == other.description: 831 answer.description = self.description 832 for k, v in self.annotations.items(): 833 if k in other.annotations and other.annotations[k] == v: 834 answer.annotations[k] = v 835 # Can append matching per-letter-annotation 836 for k, v in self.letter_annotations.items(): 837 if k in other.letter_annotations: 838 answer.letter_annotations[k] = v + other.letter_annotations[k] 839 return answer
840
841 - def __radd__(self, other):
842 """Add another sequence or string to this sequence (from the left). 843 844 This method handles adding a Seq object (or similar, e.g. MutableSeq) 845 or a plain Python string (on the left) to a SeqRecord (on the right). 846 See the __add__ method for more details, but for example: 847 848 >>> from Bio import SeqIO 849 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 850 >>> print("%s %s" % (record.id, record.seq)) 851 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 852 >>> print(list(record.letter_annotations)) 853 ['solexa_quality'] 854 855 >>> new = "ACT" + record 856 >>> print("%s %s" % (new.id, new.seq)) 857 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 858 >>> print(list(new.letter_annotations)) 859 [] 860 """ 861 if isinstance(other, SeqRecord): 862 raise RuntimeError("This should have happened via the __add__ of " 863 "the other SeqRecord being added!") 864 # Assume it is a string or a Seq. 865 # Note can't transfer any per-letter-annotations 866 offset = len(other) 867 return SeqRecord(other + self.seq, 868 id=self.id, name=self.name, 869 description=self.description, 870 features=[f._shift(offset) for f in self.features], 871 annotations=self.annotations.copy(), 872 dbxrefs=self.dbxrefs[:])
873
874 - def upper(self):
875 """Returns a copy of the record with an upper case sequence. 876 877 All the annotation is preserved unchanged. e.g. 878 879 >>> from Bio.Alphabet import generic_dna 880 >>> from Bio.Seq import Seq 881 >>> from Bio.SeqRecord import SeqRecord 882 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 883 ... description = "Made up for this example") 884 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 885 >>> print(record.upper().format("fastq")) 886 @Test Made up for this example 887 ACGTACGT 888 + 889 "#$%&'() 890 <BLANKLINE> 891 892 Naturally, there is a matching lower method: 893 894 >>> print(record.lower().format("fastq")) 895 @Test Made up for this example 896 acgtacgt 897 + 898 "#$%&'() 899 <BLANKLINE> 900 """ 901 return SeqRecord(self.seq.upper(), 902 id=self.id, name=self.name, 903 description=self.description, 904 dbxrefs=self.dbxrefs[:], 905 features=self.features[:], 906 annotations=self.annotations.copy(), 907 letter_annotations=self.letter_annotations.copy())
908
909 - def lower(self):
910 """Returns a copy of the record with a lower case sequence. 911 912 All the annotation is preserved unchanged. e.g. 913 914 >>> from Bio import SeqIO 915 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 916 >>> print(record.format("fasta")) 917 >gi|3298468|dbj|BAA31520.1| SAMIPF 918 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 919 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 920 <BLANKLINE> 921 >>> print(record.lower().format("fasta")) 922 >gi|3298468|dbj|BAA31520.1| SAMIPF 923 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 924 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 925 <BLANKLINE> 926 927 To take a more annotation rich example, 928 929 >>> from Bio import SeqIO 930 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 931 >>> len(old.features) 932 3 933 >>> new = old.lower() 934 >>> len(old.features) == len(new.features) 935 True 936 >>> old.annotations["organism"] == new.annotations["organism"] 937 True 938 >>> old.dbxrefs == new.dbxrefs 939 True 940 """ 941 return SeqRecord(self.seq.lower(), 942 id=self.id, name=self.name, 943 description=self.description, 944 dbxrefs=self.dbxrefs[:], 945 features=self.features[:], 946 annotations=self.annotations.copy(), 947 letter_annotations=self.letter_annotations.copy())
948
949 - def reverse_complement(self, id=False, name=False, description=False, 950 features=True, annotations=False, 951 letter_annotations=True, dbxrefs=False):
952 """Returns new SeqRecord with reverse complement sequence. 953 954 You can specify the returned record's id, name and description as 955 strings, or True to keep that of the parent, or False for a default. 956 957 You can specify the returned record's features with a list of 958 SeqFeature objects, or True to keep that of the parent, or False to 959 omit them. The default is to keep the original features (with the 960 strand and locations adjusted). 961 962 You can also specify both the returned record's annotations and 963 letter_annotations as dictionaries, True to keep that of the parent, 964 or False to omit them. The default is to keep the original 965 annotations (with the letter annotations reversed). 966 967 To show what happens to the pre-letter annotations, consider an 968 example Solexa variant FASTQ file with a single entry, which we'll 969 read in as a SeqRecord: 970 971 >>> from Bio import SeqIO 972 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 973 >>> print("%s %s" % (record.id, record.seq)) 974 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 975 >>> print(list(record.letter_annotations)) 976 ['solexa_quality'] 977 >>> print(record.letter_annotations["solexa_quality"]) 978 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 979 980 Now take the reverse complement, 981 982 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 983 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 984 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 985 986 Notice that the per-letter-annotations have also been reversed, 987 although this may not be appropriate for all cases. 988 989 >>> print(rc_record.letter_annotations["solexa_quality"]) 990 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 991 992 Now for the features, we need a different example. Parsing a GenBank 993 file is probably the easiest way to get an nice example with features 994 in it... 995 996 >>> from Bio import SeqIO 997 >>> with open("GenBank/pBAD30.gb") as handle: 998 ... plasmid = SeqIO.read(handle, "gb") 999 >>> print("%s %i" % (plasmid.id, len(plasmid))) 1000 pBAD30 4923 1001 >>> plasmid.seq 1002 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1003 >>> len(plasmid.features) 1004 13 1005 1006 Now, let's take the reverse complement of this whole plasmid: 1007 1008 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1009 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1010 pBAD30_rc 4923 1011 >>> rc_plasmid.seq 1012 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1013 >>> len(rc_plasmid.features) 1014 13 1015 1016 Let's compare the first CDS feature - it has gone from being the 1017 second feature (index 1) to the second last feature (index -2), its 1018 strand has changed, and the location switched round. 1019 1020 >>> print(plasmid.features[1]) 1021 type: CDS 1022 location: [1081:1960](-) 1023 qualifiers: 1024 Key: label, Value: ['araC'] 1025 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1026 Key: vntifkey, Value: ['4'] 1027 <BLANKLINE> 1028 >>> print(rc_plasmid.features[-2]) 1029 type: CDS 1030 location: [2963:3842](+) 1031 qualifiers: 1032 Key: label, Value: ['araC'] 1033 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1034 Key: vntifkey, Value: ['4'] 1035 <BLANKLINE> 1036 1037 You can check this new location, based on the length of the plasmid: 1038 1039 >>> len(plasmid) - 1081 1040 3842 1041 >>> len(plasmid) - 1960 1042 2963 1043 1044 Note that if the SeqFeature annotation includes any strand specific 1045 information (e.g. base changes for a SNP), this information is not 1046 ammended, and would need correction after the reverse complement. 1047 1048 Note trying to reverse complement a protein SeqRecord raises an 1049 exception: 1050 1051 >>> from Bio.SeqRecord import SeqRecord 1052 >>> from Bio.Seq import Seq 1053 >>> from Bio.Alphabet import IUPAC 1054 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1055 >>> protein_rec.reverse_complement() 1056 Traceback (most recent call last): 1057 ... 1058 ValueError: Proteins do not have complements! 1059 1060 Also note you can reverse complement a SeqRecord using a MutableSeq: 1061 1062 >>> from Bio.SeqRecord import SeqRecord 1063 >>> from Bio.Seq import MutableSeq 1064 >>> from Bio.Alphabet import generic_dna 1065 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1066 >>> rec.seq[0] = "T" 1067 >>> print("%s %s" % (rec.id, rec.seq)) 1068 Test TCGT 1069 >>> rc = rec.reverse_complement(id=True) 1070 >>> print("%s %s" % (rc.id, rc.seq)) 1071 Test ACGA 1072 """ 1073 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1074 if isinstance(self.seq, MutableSeq): 1075 # Currently the MutableSeq reverse complement is in situ 1076 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1077 else: 1078 answer = SeqRecord(self.seq.reverse_complement()) 1079 if isinstance(id, basestring): 1080 answer.id = id 1081 elif id: 1082 answer.id = self.id 1083 if isinstance(name, basestring): 1084 answer.name = name 1085 elif name: 1086 answer.name = self.name 1087 if isinstance(description, basestring): 1088 answer.description = description 1089 elif description: 1090 answer.description = self.description 1091 if isinstance(dbxrefs, list): 1092 answer.dbxrefs = dbxrefs 1093 elif dbxrefs: 1094 # Copy the old dbxrefs 1095 answer.dbxrefs = self.dbxrefs[:] 1096 if isinstance(features, list): 1097 answer.features = features 1098 elif features: 1099 # Copy the old features, adjusting location and string 1100 l = len(answer) 1101 answer.features = [f._flip(l) for f in self.features] 1102 # The old list should have been sorted by start location, 1103 # reversing it will leave it sorted by what is now the end position, 1104 # so we need to resort in case of overlapping features. 1105 # NOTE - In the common case of gene before CDS (and similar) with 1106 # the exact same locations, this will still maintain gene before CDS 1107 answer.features.sort(key=lambda x: x.location.start.position) 1108 if isinstance(annotations, dict): 1109 answer.annotations = annotations 1110 elif annotations: 1111 # Copy the old annotations, 1112 answer.annotations = self.annotations.copy() 1113 if isinstance(letter_annotations, dict): 1114 answer.letter_annotations = letter_annotations 1115 elif letter_annotations: 1116 # Copy the old per letter annotations, reversing them 1117 for key, value in self.letter_annotations.items(): 1118 answer._per_letter_annotations[key] = value[::-1] 1119 return answer
1120 1121 1122 if __name__ == "__main__": 1123 from Bio._utils import run_doctest 1124 run_doctest() 1125